diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 5e44f716812e5..65adb0bbd9004 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -231,6 +231,11 @@ Changelog scoring="max_error" which is now deprecated. :pr:`29462` by :user:`Farid "Freddie" Taba `. +- |API| the `force_all_finite` parameter of functions + :func:`metrics.pairwise.check_pairwise_arrays` and :func:`metrics.pairwise_distances` + is renamed to `ensure_all_finite`. `force_all_finite` will be removed in 1.8. + :pr:`29404` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.model_selection` .............................. @@ -272,6 +277,14 @@ Changelog traversed. :pr:`27966` by :user:`Adam Li `. +:mod:`sklearn.utils` +.................... + +- |API| the `force_all_finite` parameter of functions :func:`utils.check_array`, + :func:`utils.check_X_y`, and :func:`utils.as_float_array` is renamed to + `ensure_all_finite`. `force_all_finite` will be removed in 1.8. + :pr:`29404` by :user:`Jérémie du Boisberranger `. + .. rubric:: Code and documentation contributors Thanks to everyone who has contributed to the maintenance and improvement of diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 6578f53745f45..58e3febddc652 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -737,7 +737,7 @@ def fit(self, X, y=None): X = self._validate_data( X, accept_sparse=["csr", "lil"], - force_all_finite=False, + ensure_all_finite=False, dtype=np.float64, ) self._raw_data = X @@ -782,7 +782,7 @@ def fit(self, X, y=None): # Perform data validation after removing infinite values (numpy.inf) # from the given distance matrix. X = self._validate_data( - X, force_all_finite=False, dtype=np.float64, force_writeable=True + X, ensure_all_finite=False, dtype=np.float64, force_writeable=True ) if np.isnan(X).any(): # TODO: Support np.nan in Cython implementation for precomputed diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c4a3ab67849ea..2f8c28d4b02b9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1137,7 +1137,7 @@ def _hstack(self, Xs, *, n_samples): # in a sparse matrix, `check_array` is used for the # dtype conversion if necessary. converted_Xs = [ - check_array(X, accept_sparse=True, force_all_finite=False) + check_array(X, accept_sparse=True, ensure_all_finite=False) for X in Xs ] except ValueError as e: @@ -1325,7 +1325,7 @@ def _check_X(X): """Use check_array only when necessary, e.g.
on lists and other non-array-likes.""" if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X): return X - return check_array(X, force_all_finite="allow-nan", dtype=object) + return check_array(X, ensure_all_finite="allow-nan", dtype=object) def _is_empty_column_selection(column): diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index c2ef2572ab786..476bad9f1e9ed 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -262,7 +262,7 @@ def fit(self, X, y, **fit_params): y, input_name="y", accept_sparse=False, - force_all_finite=True, + ensure_all_finite=True, ensure_2d=False, dtype="numeric", allow_nd=True, diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 1e194857c64a2..bd2364ab472bb 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -90,7 +90,7 @@ def empirical_covariance(X, *, assume_centered=False): [0.25, 0.25, 0.25], [0.25, 0.25, 0.25]]) """ - X = check_array(X, ensure_2d=False, force_all_finite=False) + X = check_array(X, ensure_2d=False, ensure_all_finite=False) if X.ndim == 1: X = np.reshape(X, (1, -1)) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 81d3163556626..24bbda275c12d 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -391,7 +391,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): y, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) @@ -941,7 +941,7 @@ def predict_proba(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -991,7 +991,7 @@ def predict_log_proba(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -1046,7 +1046,7 @@ def decision_function(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -1279,7 +1279,7 @@ def predict(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d55a0c645e929..cd09c7571a33e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -362,7 +362,7 @@ def fit(self, X, y, sample_weight=None): multi_output=True, accept_sparse="csc", dtype=DTYPE, - force_all_finite=False, + ensure_all_finite=False, ) # _compute_missing_values_in_feature_mask checks if X has missing values and # will raise an error if the underlying tree base estimator can't handle missing @@ -630,16 +630,16 @@ def _validate_X_predict(self, X): Validate X whenever one tries to predict, apply, predict_proba.""" check_is_fitted(self) if self.estimators_[0]._support_missing_values(X): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=DTYPE, accept_sparse="csr", reset=False, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based sparse matrices") diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c280ca695bcfd..63792f2c44975 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -773,7 
+773,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): dtype=DTYPE, order="C", accept_sparse="csr", - force_all_finite=False, + ensure_all_finite=False, ) raw_predictions = self._raw_predict(X_train) self._resize_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index ed1bca8558f81..c428c742af883 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -194,7 +194,7 @@ def fit(self, X, y=None): ) ) - X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) max_bins = self.n_bins - 1 rng = check_random_state(self.random_state) @@ -275,7 +275,7 @@ def transform(self, X): X_binned : array-like of shape (n_samples, n_features) The binned data (fortran-aligned). """ - X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) check_is_fitted(self) if X.shape[1] != self.n_bins_non_missing_.shape[0]: raise ValueError( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 8db6f7e4d5ff4..c0b79119bfedf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -263,7 +263,7 @@ def _preprocess_X(self, X, *, reset): """ # If there is a preprocessor, we let the preprocessor handle the validation. # Otherwise, we validate the data ourselves. - check_X_kwargs = dict(dtype=[X_DTYPE], force_all_finite=False) + check_X_kwargs = dict(dtype=[X_DTYPE], ensure_all_finite=False) if not reset: if self._preprocessor is None: return self._validate_data(X, reset=False, **check_X_kwargs) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index bbdcb38ef013a..24b5b02aa0696 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -12,6 +12,10 @@ from sklearn.model_selection import train_test_split +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize( "loss", @@ -118,6 +122,10 @@ def test_same_predictions_regression( assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-4)) > 1 - 0.01 +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("min_samples_leaf", (1, 20)) @pytest.mark.parametrize( @@ -191,6 +199,10 @@ def test_same_predictions_classification( np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("min_samples_leaf", (1, 20)) @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index b8df148f20a1a..f4694d69e2dbb 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -316,7 +316,7 
@@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. """ X = self._validate_data( - X, accept_sparse=["csc"], dtype=tree_dtype, force_all_finite=False + X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False ) if issparse(X): # Pre-sort indices to avoid that each individual tree of the @@ -522,7 +522,7 @@ def score_samples(self, X): accept_sparse="csr", dtype=tree_dtype, reset=False, - force_all_finite=False, + ensure_all_finite=False, ) return self._score_samples(X) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 12553403f4b90..7504f7345a264 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -103,7 +103,7 @@ def transform(self, X): X, dtype=None, accept_sparse="csr", - force_all_finite=not _safe_tags(self, key="allow_nan"), + ensure_all_finite=not _safe_tags(self, key="allow_nan"), cast_to_ndarray=not preserve_X, reset=False, ) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 45b132cf6e460..524c791be6989 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -274,7 +274,7 @@ def _fit(self, X, y, step_score=None, **fit_params): y, accept_sparse="csc", ensure_min_features=2, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) @@ -725,7 +725,7 @@ def fit(self, X, y, groups=None): y, accept_sparse="csr", ensure_min_features=2, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 471f9a373a3da..b3216d12f6328 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -214,7 +214,7 @@ def fit(self, X, y=None): X, accept_sparse="csc", ensure_min_features=2, - force_all_finite=not tags.get("allow_nan", True), + ensure_all_finite=not tags.get("allow_nan", True), ) n_features = X.shape[1] diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7494b72c1acb8..bacce97f1250f 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -101,7 +101,7 @@ def fit(self, X, y=None): X, accept_sparse=("csr", "csc"), dtype=np.float64, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if hasattr(X, "toarray"): # sparse matrix diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 6109e3fde7b2a..5674bdc5ba85a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -323,9 +323,9 @@ def _validate_input(self, X, in_fit): dtype = self._fit_dtype if is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True try: X = self._validate_data( @@ -334,7 +334,7 @@ def _validate_input(self, X, in_fit): accept_sparse="csc", dtype=dtype, force_writeable=True if not in_fit else None, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, ) except ValueError as ve: @@ -893,15 +893,15 @@ def _get_missing_features_info(self, X): def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, reset=in_fit, accept_sparse=("csc", "csr"), dtype=None, - 
force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index e2d06844611c9..e753e0d420ed2 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -614,16 +614,16 @@ def _initial_imputation(self, X, in_fit=False): number of features. """ if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=FLOAT_DTYPES, order="F", reset=in_fit, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) _check_inputs_dtype(X, self.missing_values) @@ -680,7 +680,7 @@ def _validate_limit(limit, limit_type, n_features): limit = limit_bound if limit is None else limit if np.isscalar(limit): limit = np.full(n_features, limit) - limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False) + limit = check_array(limit, ensure_all_finite=False, copy=False, ensure_2d=False) if not limit.shape[0] == n_features: raise ValueError( f"'{limit_type}_value' should be of " diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 2e18246b4b9bb..f22e1de79cd85 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -225,15 +225,15 @@ def fit(self, X, y=None): """ # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, ) @@ -262,15 +262,15 @@ def transform(self, X): check_is_fitted(self) if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, accept_sparse=False, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, reset=False, ) @@ -363,7 +363,7 @@ def process_chunk(dist_chunk, start): self._fit_X, metric=self.metric, missing_values=self.missing_values, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, reduce_func=process_chunk, ) for chunk in gen: diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index b74258615a447..913976b2544e3 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -566,7 +566,7 @@ def partial_dependence( # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not (hasattr(X, "__array__") or sparse.issparse(X)): - X = check_array(X, force_all_finite="allow-nan", dtype=object) + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) if is_regressor(estimator) and response_method != "auto": raise ValueError( diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 8ecd7237b077d..fb3c646a271a6 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -266,7 +266,7 @@ def permutation_importance( array([0.2211..., 0. , 0. 
]) """ if not hasattr(X, "iloc"): - X = check_array(X, force_all_finite="allow-nan", dtype=None) + X = check_array(X, ensure_all_finite="allow-nan", dtype=None) # Precompute random seed from the random state to be used # to get a fresh independent RandomState instance for each diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index ecb421ccdd68a..02abe62dbb2c1 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -543,7 +543,7 @@ def from_estimator( # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not (hasattr(X, "__array__") or sparse.issparse(X)): - X = check_array(X, force_all_finite="allow-nan", dtype=object) + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) n_features = X.shape[1] feature_names = _check_feature_names(X, feature_names) diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index eb4e52be2656c..cf051228d210a 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -1056,7 +1056,7 @@ def fit(self, X, y, **fit_params): _raise_for_params(fit_params, self, "fit") X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2) - X = as_float_array(X, copy=False, force_all_finite=False) + X = as_float_array(X, copy=False, ensure_all_finite=False) cv = check_cv(self.cv, classifier=False) if _routing_enabled(): routed_params = process_routing(self, "fit", **fit_params) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 631a2c1c66815..6943894002c7c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -363,7 +363,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. _raise_for_params(fit_params, self, "fit") - check_X_params = dict(accept_sparse="csr", force_all_finite=False) + check_X_params = dict(accept_sparse="csr", ensure_all_finite=False) check_y_params = dict(ensure_2d=False) X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) @@ -630,7 +630,7 @@ def predict(self, X, **params): check_is_fitted(self) X = self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, accept_sparse=True, reset=False, ) @@ -678,7 +678,7 @@ def score(self, X, y, **params): check_is_fitted(self) X = self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, accept_sparse=True, reset=False, ) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f8b163813d6d6..745767f23e818 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -42,6 +42,7 @@ StrOptions, validate_params, ) +from ..utils.deprecation import _deprecate_force_all_finite from ..utils.extmath import row_norms, safe_sparse_dot from ..utils.fixes import parse_version, sp_base_version from ..utils.parallel import Parallel, delayed @@ -82,7 +83,8 @@ def check_pairwise_arrays( precomputed=False, dtype="infer_float", accept_sparse="csr", - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=None, ensure_2d=True, copy=False, ): @@ -138,6 +140,22 @@ def check_pairwise_arrays( .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan`. + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. 
+ + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + ensure_2d : bool, default=True Whether to raise an error when the input arrays are not 2-dimensional. Setting this to `False` is necessary when using a custom metric with certain @@ -160,6 +178,8 @@ def check_pairwise_arrays( An array equal to Y if Y was not None, guaranteed to be a numpy array. If Y was None, safe_Y will be a pointer to X. """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + xp, _ = get_namespace(X, Y) if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): X, Y, dtype_float = _return_float_dtype(X, Y) @@ -176,7 +196,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -186,7 +206,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -195,7 +215,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -514,9 +534,9 @@ def nan_euclidean_distances( [1.41421356]]) """ - force_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True + ensure_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True X, Y = check_pairwise_arrays( - X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy + X, Y, accept_sparse=False, ensure_all_finite=ensure_all_finite, copy=copy ) # Get missing mask for X missing_X = _get_mask(X, missing_values) @@ -1940,13 +1960,13 @@ def _parallel_pairwise(X, Y, func, n_jobs, **kwds): return ret -def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): +def _pairwise_callable(X, Y, metric, ensure_all_finite=True, **kwds): """Handle the callable case for pairwise_{distances,kernels}.""" X, Y = check_pairwise_arrays( X, Y, dtype=None, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ensure_2d=False, ) @@ -2221,7 +2241,12 @@ def pairwise_distances_chunked( "Y": ["array-like", "sparse matrix", None], "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], "n_jobs": [Integral, None], - "force_all_finite": ["boolean", StrOptions({"allow-nan"})], + "force_all_finite": [ + "boolean", + StrOptions({"allow-nan"}), + Hidden(StrOptions({"deprecated"})), + ], + "ensure_all_finite": ["boolean", StrOptions({"allow-nan"}), Hidden(None)], }, prefer_skip_nested_validation=True, ) @@ -2231,7 +2256,8 @@ def pairwise_distances( metric="euclidean", *, n_jobs=None, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=None, **kwds, ): """Compute the distance matrix from a vector array X and optional Y. @@ -2331,6 +2357,23 @@ def pairwise_distances( .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan`. + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. 
+ + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. If using a scipy.spatial.distance metric, the parameters are still @@ -2362,9 +2405,11 @@ def pairwise_distances( array([[1., 2.], [2., 1.]]) """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + if metric == "precomputed": X, _ = check_pairwise_arrays( - X, Y, precomputed=True, force_all_finite=force_all_finite + X, Y, precomputed=True, ensure_all_finite=ensure_all_finite ) whom = ( @@ -2379,7 +2424,7 @@ def pairwise_distances( func = partial( _pairwise_callable, metric=metric, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, **kwds, ) else: @@ -2393,7 +2438,7 @@ def pairwise_distances( warnings.warn(msg, DataConversionWarning) X, Y = check_pairwise_arrays( - X, Y, dtype=dtype, force_all_finite=force_all_finite + X, Y, dtype=dtype, ensure_all_finite=ensure_all_finite ) # precompute data-derived metric params diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index a1c6d00bed847..b3f8146b275c5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1637,3 +1637,17 @@ def test_sparse_manhattan_readonly_dataset(csr_container): Parallel(n_jobs=2, max_nbytes=0)( delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) ) + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + Y = np.random.uniform(size=(10, 10)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_pairwise_arrays(X, Y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + pairwise_distances(X, Y, force_all_finite=True) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0c6252975a7f8..bacbbfd461995 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -124,7 +124,7 @@ class _ConstantPredictor(BaseEstimator): def fit(self, X, y): check_params = dict( - force_all_finite=False, dtype=None, ensure_2d=False, accept_sparse=True + ensure_all_finite=False, dtype=None, ensure_2d=False, accept_sparse=True ) self._validate_data( X, y, reset=True, validate_separately=(check_params, check_params) @@ -136,7 +136,7 @@ def predict(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -149,7 +149,7 @@ def decision_function(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -162,7 +162,7 @@ def predict_proba(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -786,7 +786,7 @@ def fit(self, X, y, **fit_params): # We need to validate the data because we do a safe_indexing later. 
X, y = self._validate_data( - X, y, accept_sparse=["csr", "csc"], force_all_finite=False + X, y, accept_sparse=["csr", "csc"], ensure_all_finite=False ) check_classification_targets(y) @@ -889,7 +889,7 @@ def partial_fit(self, X, y, classes=None, **partial_fit_params): X, y, accept_sparse=["csr", "csc"], - force_all_finite=False, + ensure_all_finite=False, reset=first_call, ) check_classification_targets(y) @@ -962,7 +962,7 @@ def decision_function(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index e2d5a3dc24de8..f7f9932f590cd 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1421,14 +1421,14 @@ def _more_tags(self): def _check_X(self, X): """Validate X, used only in predict* methods.""" X = self._validate_data( - X, dtype="int", accept_sparse=False, force_all_finite=True, reset=False + X, dtype="int", accept_sparse=False, ensure_all_finite=True, reset=False ) check_non_negative(X, "CategoricalNB (input X)") return X def _check_X_y(self, X, y, reset=True): X, y = self._validate_data( - X, y, dtype="int", accept_sparse=False, force_all_finite=True, reset=reset + X, y, dtype="int", accept_sparse=False, ensure_all_finite=True, reset=reset ) check_non_negative(X, "CategoricalNB (input X)") return X, y diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 68a887f7e2042..62042c6ba831d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -218,7 +218,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): ensure_2d=False, estimator="the scale function", dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): if with_mean: @@ -485,7 +485,7 @@ def partial_fit(self, X, y=None): X, reset=first_pass, dtype=_array_api.supported_float_dtypes(xp), - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) data_min = _array_api._nanmin(X, axis=0, xp=xp) @@ -530,7 +530,7 @@ def transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=False, ) @@ -562,7 +562,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) X -= self.min_ @@ -668,7 +668,11 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): # Unlike the scaler object, this function allows 1d input. # If copy is required, it will be done inside the scaler object. 
X = check_array( - X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -909,7 +913,7 @@ def partial_fit(self, X, y=None, sample_weight=None): X, accept_sparse=("csr", "csc"), dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=first_call, ) n_features = X.shape[1] @@ -1043,7 +1047,7 @@ def transform(self, X, copy=None): copy=copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1085,7 +1089,7 @@ def inverse_transform(self, X, copy=None): copy=copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1247,7 +1251,7 @@ def partial_fit(self, X, y=None): reset=first_pass, accept_sparse=("csr", "csc"), dtype=_array_api.supported_float_dtypes(xp), - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1290,7 +1294,7 @@ def transform(self, X): reset=False, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1322,7 +1326,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1417,7 +1421,7 @@ def maxabs_scale(X, *, axis=0, copy=True): copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -1592,7 +1596,7 @@ def fit(self, X, y=None): X, accept_sparse="csc", dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) q_min, q_max = self.quantile_range @@ -1656,7 +1660,7 @@ def transform(self, X): dtype=FLOAT_DTYPES, force_writeable=True, reset=False, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1689,7 +1693,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1822,7 +1826,7 @@ def robust_scale( copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -2866,7 +2870,7 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): # only set force_writeable for the validation at transform time because # it's the only place where QuantileTransformer performs inplace operations. force_writeable=True if not in_fit else None, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. 
@@ -3506,7 +3510,7 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False): dtype=FLOAT_DTYPES, force_writeable=True, copy=self.copy, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=in_fit, ) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e3924c11fb635..50b3cd9498639 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -27,7 +27,7 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ - def _check_X(self, X, force_all_finite=True): + def _check_X(self, X, ensure_all_finite=True): """ Perform custom check_array: - convert list of strings to object dtype @@ -41,16 +41,16 @@ def _check_X(self, X, force_all_finite=True): """ if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): # if not a dataframe, do normal check_array validation - X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite) + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=object, force_all_finite=force_all_finite) + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) else: X = X_temp needs_validation = False else: # pandas dataframe, do validation later column by column, in order # to keep the dtype information to be used in the encoder. - needs_validation = force_all_finite + needs_validation = ensure_all_finite n_samples, n_features = X.shape X_columns = [] @@ -58,7 +58,7 @@ def _check_X(self, X, force_all_finite=True): for i in range(n_features): Xi = _safe_indexing(X, indices=i, axis=1) Xi = check_array( - Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation ) X_columns.append(Xi) @@ -68,7 +68,7 @@ def _fit( self, X, handle_unknown="error", - force_all_finite=True, + ensure_all_finite=True, return_counts=False, return_and_ignore_missing_for_infrequent=False, ): @@ -76,7 +76,7 @@ def _fit( self._check_n_features(X, reset=True) self._check_feature_names(X, reset=True) X_list, n_samples, n_features = self._check_X( - X, force_all_finite=force_all_finite + X, ensure_all_finite=ensure_all_finite ) self.n_features_in_ = n_features @@ -186,12 +186,12 @@ def _transform( self, X, handle_unknown="error", - force_all_finite=True, + ensure_all_finite=True, warn_on_unknown=False, ignore_category_indices=None, ): X_list, n_samples, n_features = self._check_X( - X, force_all_finite=force_all_finite + X, ensure_all_finite=ensure_all_finite ) self._check_feature_names(X, reset=False) self._check_n_features(X, reset=False) @@ -975,7 +975,7 @@ def fit(self, X, y=None): self._fit( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) self._set_drop_idx() self._n_features_outs = self._compute_n_features_outs() @@ -1023,7 +1023,7 @@ def transform(self, X): X_int, X_mask = self._transform( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", warn_on_unknown=warn_on_unknown, ) @@ -1495,7 +1495,7 @@ def fit(self, X, y=None): fit_results = self._fit( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", return_and_ignore_missing_for_infrequent=True, ) self._missing_indices = fit_results["missing_indices"] @@ -1577,7 +1577,7 @@ def transform(self, X): X_int, X_mask = self._transform( X, handle_unknown=self.handle_unknown, - 
force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ignore_category_indices=self._missing_indices, ) X_trans = X_int.astype(self.dtype, copy=False) @@ -1606,7 +1606,7 @@ def inverse_transform(self, X): Inverse transformed array. """ check_is_fitted(self) - X = check_array(X, force_all_finite="allow-nan") + X = check_array(X, ensure_all_finite="allow-nan") n_samples, _ = X.shape n_features = len(self.categories_) diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index 1855f340c624d..842598daa2dd7 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -325,7 +325,7 @@ def transform(self, X): Transformed input. """ X_ordinal, X_known_mask = self._transform( - X, handle_unknown="ignore", force_all_finite="allow-nan" + X, handle_unknown="ignore", ensure_all_finite="allow-nan" ) # If 'multiclass' multiply axis=1 by num of classes else keep shape the same @@ -356,7 +356,7 @@ def _fit_encodings_all(self, X, y): ) check_consistent_length(X, y) - self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan") + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") if self.target_type == "auto": accepted_target_types = ("binary", "multiclass", "continuous") @@ -386,7 +386,7 @@ def _fit_encodings_all(self, X, y): self.target_mean_ = np.mean(y, axis=0) X_ordinal, X_known_mask = self._transform( - X, handle_unknown="ignore", force_all_finite="allow-nan" + X, handle_unknown="ignore", ensure_all_finite="allow-nan" ) n_categories = np.fromiter( (len(category_for_feature) for category_for_feature in self.categories_), diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index b1ebea1061e4c..916cf1a338cd2 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -283,7 +283,7 @@ def fit(self, X, y, **params): # we need row slicing support for sparse matrices, but costly finiteness check # can be delegated to the base estimator. 
X, y = self._validate_data( - X, y, accept_sparse=["csr", "csc", "lil", "dok"], force_all_finite=False + X, y, accept_sparse=["csr", "csc", "lil", "dok"], ensure_all_finite=False ) if y.dtype.kind in ["U", "S"]: @@ -415,7 +415,7 @@ def predict(self, X, **params): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.estimator_.predict(X, **routed_params.estimator.predict) @@ -457,7 +457,7 @@ def predict_proba(self, X, **params): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.estimator_.predict_proba(X, **routed_params.estimator.predict_proba) @@ -499,7 +499,7 @@ def decision_function(self, X, **params): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.estimator_.decision_function( @@ -543,7 +543,7 @@ def predict_log_proba(self, X, **params): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.estimator_.predict_log_proba( @@ -589,7 +589,7 @@ def score(self, X, y, **params): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.estimator_.score(X, y, **routed_params.estimator.score) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 29352d080414d..efa5eb6e8f84d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -238,7 +238,7 @@ def _fit( # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( - dtype=DTYPE, accept_sparse="csc", force_all_finite=False + dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) X, y = self._validate_data( @@ -475,15 +475,15 @@ def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: if self._support_missing_values(X): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=DTYPE, accept_sparse="csr", reset=False, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) if issparse(X) and ( X.indices.dtype != np.intc or X.indptr.dtype != np.intc diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index b3eca8053b94a..ff08ec2aceb81 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -136,3 +136,28 @@ def _deprecate_Xt_in_inverse_transform(X, Xt): return Xt return X + + +# TODO(1.8): remove force_all_finite and change the default value of ensure_all_finite +# to True (remove None without deprecation). +def _deprecate_force_all_finite(force_all_finite, ensure_all_finite): + """Helper to deprecate force_all_finite in favor of ensure_all_finite.""" + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8.", + FutureWarning, + ) + + if ensure_all_finite is not None: + raise ValueError( + "'force_all_finite' and 'ensure_all_finite' cannot be used together. " + "Pass `ensure_all_finite` only." 
+ ) + + return force_all_finite + + if ensure_all_finite is None: + return True + + return ensure_all_finite diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index ead7f416368e3..307e4269747b7 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -157,7 +157,7 @@ def is_multilabel(y): check_y_kwargs = dict( accept_sparse=True, allow_nd=True, - force_all_finite=False, + ensure_all_finite=False, ensure_2d=False, ensure_min_samples=0, ensure_min_features=0, @@ -320,7 +320,7 @@ def type_of_target(y, input_name=""): check_y_kwargs = dict( accept_sparse=True, allow_nd=True, - force_all_finite=False, + ensure_all_finite=False, ensure_2d=False, ensure_min_samples=0, ensure_min_features=0, diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 070a47c6700b2..b99dec99498ab 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -150,7 +150,7 @@ def test_as_float_array(): def test_as_float_array_nan(X): X[5, 0] = np.nan X[6, 1] = np.nan - X_converted = as_float_array(X, force_all_finite="allow-nan") + X_converted = as_float_array(X, ensure_all_finite="allow-nan") assert_allclose_dense_sparse(X_converted, X) @@ -198,18 +198,19 @@ def test_ordering(): @pytest.mark.parametrize( - "value, force_all_finite", [(np.inf, False), (np.nan, "allow-nan"), (np.nan, False)] + "value, ensure_all_finite", + [(np.inf, False), (np.nan, "allow-nan"), (np.nan, False)], ) @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) -def test_check_array_force_all_finite_valid(value, force_all_finite, retype): +def test_check_array_ensure_all_finite_valid(value, ensure_all_finite, retype): X = retype(np.arange(4).reshape(2, 2).astype(float)) X[0, 0] = value - X_checked = check_array(X, force_all_finite=force_all_finite, accept_sparse=True) + X_checked = check_array(X, ensure_all_finite=ensure_all_finite, accept_sparse=True) assert_allclose_dense_sparse(X, X_checked) @pytest.mark.parametrize( - "value, input_name, force_all_finite, match_msg", + "value, input_name, ensure_all_finite, match_msg", [ (np.inf, "", True, "Input contains infinity"), (np.inf, "X", True, "Input X contains infinity"), @@ -222,14 +223,14 @@ def test_check_array_force_all_finite_valid(value, force_all_finite, retype): np.nan, "", "allow-inf", - 'force_all_finite should be a bool or "allow-nan"', + "ensure_all_finite should be a bool or 'allow-nan'", ), (np.nan, "", 1, "Input contains NaN"), ], ) @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) -def test_check_array_force_all_finiteinvalid( - value, input_name, force_all_finite, match_msg, retype +def test_check_array_ensure_all_finite_invalid( + value, input_name, ensure_all_finite, match_msg, retype ): X = retype(np.arange(4).reshape(2, 2).astype(np.float64)) X[0, 0] = value @@ -237,7 +238,7 @@ def test_check_array_force_all_finiteinvalid( check_array( X, input_name=input_name, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, accept_sparse=True, ) @@ -284,17 +285,17 @@ def test_check_array_links_to_imputer_doc_only_for_X(input_name, retype): assert extended_msg in ctx.value.args[0] -def test_check_array_force_all_finite_object(): +def test_check_array_ensure_all_finite_object(): X = np.array([["a", "b", np.nan]], dtype=object).T - X_checked = check_array(X, dtype=None, force_all_finite="allow-nan") + X_checked = check_array(X, dtype=None, ensure_all_finite="allow-nan") assert X is X_checked - X_checked = check_array(X, 
dtype=None, force_all_finite=False) + X_checked = check_array(X, dtype=None, ensure_all_finite=False) assert X is X_checked with pytest.raises(ValueError, match="Input contains NaN"): - check_array(X, dtype=None, force_all_finite=True) + check_array(X, dtype=None, ensure_all_finite=True) @pytest.mark.parametrize( @@ -315,14 +316,14 @@ def test_check_array_force_all_finite_object(): (np.array([[1, np.nan]], dtype=object), "cannot convert float NaN to integer"), ], ) -@pytest.mark.parametrize("force_all_finite", [True, False]) -def test_check_array_force_all_finite_object_unsafe_casting( - X, err_msg, force_all_finite +@pytest.mark.parametrize("ensure_all_finite", [True, False]) +def test_check_array_ensure_all_finite_object_unsafe_casting( + X, err_msg, ensure_all_finite ): # casting a float array containing NaN or inf to int dtype should - # raise an error irrespective of the force_all_finite parameter. + # raise an error irrespective of the ensure_all_finite parameter. with pytest.raises(ValueError, match=err_msg): - check_array(X, dtype=int, force_all_finite=force_all_finite) + check_array(X, dtype=int, ensure_all_finite=ensure_all_finite) def test_check_array_series_err_msg(): @@ -507,17 +508,17 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): X = pd.DataFrame(X_np, dtype=pd_dtype, columns=["a", "b", "c"]) # column c has no nans X["c"] = X["c"].astype("float") - X_checked = check_array(X, force_all_finite="allow-nan", dtype=dtype) + X_checked = check_array(X, ensure_all_finite="allow-nan", dtype=dtype) assert_allclose(X_checked, X_np) assert X_checked.dtype == expected_dtype - X_checked = check_array(X, force_all_finite=False, dtype=dtype) + X_checked = check_array(X, ensure_all_finite=False, dtype=dtype) assert_allclose(X_checked, X_np) assert X_checked.dtype == expected_dtype msg = "Input contains NaN" with pytest.raises(ValueError, match=msg): - check_array(X, force_all_finite=True) + check_array(X, ensure_all_finite=True) def test_check_array_panadas_na_support_series(): @@ -528,14 +529,14 @@ def test_check_array_panadas_na_support_series(): msg = "Input contains NaN" with pytest.raises(ValueError, match=msg): - check_array(X_int64, force_all_finite=True, ensure_2d=False) + check_array(X_int64, ensure_all_finite=True, ensure_2d=False) - X_out = check_array(X_int64, force_all_finite=False, ensure_2d=False) + X_out = check_array(X_int64, ensure_all_finite=False, ensure_2d=False) assert_allclose(X_out, [1, 2, np.nan]) assert X_out.dtype == np.float64 X_out = check_array( - X_int64, force_all_finite=False, ensure_2d=False, dtype=np.float32 + X_int64, ensure_all_finite=False, ensure_2d=False, dtype=np.float32 ) assert_allclose(X_out, [1, 2, np.nan]) assert X_out.dtype == np.float32 @@ -1992,7 +1993,7 @@ def test_pandas_array_returns_ndarray(input_values): dtype=None, ensure_2d=False, allow_nd=False, - force_all_finite=False, + ensure_all_finite=False, ) assert np.issubdtype(result.dtype.kind, np.floating) assert_allclose(result, input_values) @@ -2186,3 +2187,20 @@ def test_check_array_writeable_df(): # df is backed by a read-only array, a copy is made assert not np.may_share_memory(out, df) assert out.flags.writeable + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + y = np.random.randint(1, size=(10,)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_array(X, force_all_finite=True) + + with pytest.warns(FutureWarning, 
match=msg): + check_X_y(X, y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + as_float_array(X, force_all_finite=True) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index af9fdb4a79cba..4bbd922d28e37 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -18,6 +18,7 @@ from .. import get_config as _get_config from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace +from ..utils.deprecation import _deprecate_force_all_finite from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype from ._isfinite import FiniteStatus, cy_isfinite from .fixes import _object_dtype_isnan @@ -212,7 +213,9 @@ def assert_all_finite( ) -def as_float_array(X, *, copy=True, force_all_finite=True): +def as_float_array( + X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=None +): """Convert an array-like to an array of floats. The new dtype will be np.float32 or np.float64, depending on the original @@ -243,6 +246,22 @@ def as_float_array(X, *, copy=True, force_all_finite=True): .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in X. The + possibilities are: + + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + Returns ------- XT : {ndarray, sparse matrix} @@ -256,6 +275,8 @@ def as_float_array(X, *, copy=True, force_all_finite=True): >>> as_float_array(array) array([0., 0., 1., 2., 2.]) """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + if isinstance(X, np.matrix) or ( not isinstance(X, np.ndarray) and not sp.issparse(X) ): @@ -264,7 +285,7 @@ def as_float_array(X, *, copy=True, force_all_finite=True): accept_sparse=["csr", "csc", "coo"], dtype=np.float64, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ensure_2d=False, ) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: @@ -514,7 +535,7 @@ def _ensure_sparse_format( accept_sparse, dtype, copy, - force_all_finite, + ensure_all_finite, accept_large_sparse, estimator_name=None, input_name="", @@ -542,7 +563,7 @@ def _ensure_sparse_format( Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. - force_all_finite : bool or 'allow-nan' + ensure_all_finite : bool or 'allow-nan' Whether to raise an error on np.inf, np.nan, pd.NA in X. The possibilities are: @@ -552,7 +573,7 @@ def _ensure_sparse_format( be infinite. .. versionadded:: 0.20 - ``force_all_finite`` accepts the string ``'allow-nan'``. + ``ensure_all_finite`` accepts the string ``'allow-nan'``. .. 
versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` @@ -615,7 +636,7 @@ # force copy sparse_container = sparse_container.copy() - if force_all_finite: + if ensure_all_finite: if not hasattr(sparse_container, "data"): warnings.warn( f"Can't check {sparse_container.format} sparse matrix for nan or inf.", @@ -624,7 +645,7 @@ else: _assert_all_finite( sparse_container.data, - allow_nan=force_all_finite == "allow-nan", + allow_nan=ensure_all_finite == "allow-nan", estimator_name=estimator_name, input_name=input_name, ) @@ -718,7 +739,8 @@ def check_array( order=None, copy=False, force_writeable=False, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=None, ensure_2d=True, allow_nd=False, ensure_min_samples=1, @@ -790,6 +812,22 @@ .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + ensure_2d : bool, default=True Whether to raise a value error if array is not 2D. @@ -831,6 +869,8 @@ >>> X_checked array([[1, 2, 3], [4, 5, 6]]) """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + if isinstance(array, np.matrix): raise TypeError( "np.matrix is not supported. Please convert to a numpy array with " @@ -924,11 +964,10 @@ def is_sparse(dtype): # Since we converted here, we do not need to convert again later dtype = None - if force_all_finite not in (True, False, "allow-nan"): + if ensure_all_finite not in (True, False, "allow-nan"): raise ValueError( - 'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format( - force_all_finite - ) + "ensure_all_finite should be a bool or 'allow-nan'. Got " + f"{ensure_all_finite!r} instead." ) if dtype is not None and _is_numpy_namespace(xp): @@ -967,7 +1006,7 @@ def is_sparse(dtype): accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, accept_large_sparse=accept_large_sparse, estimator_name=estimator_name, input_name=input_name, @@ -1054,12 +1093,12 @@ def is_sparse(dtype): % (array.ndim, estimator_name) ) - if force_all_finite: + if ensure_all_finite: _assert_all_finite( array, input_name=input_name, estimator_name=estimator_name, - allow_nan=force_all_finite == "allow-nan", + allow_nan=ensure_all_finite == "allow-nan", ) if copy: @@ -1155,7 +1194,8 @@ def check_X_y( order=None, copy=False, force_writeable=False, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=None, ensure_2d=True, allow_nd=False, multi_output=False, @@ -1217,7 +1257,7 @@ .. versionadded:: 1.6 force_all_finite : bool or 'allow-nan', default=True Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are: - True: Force all values of X to be finite. - False: accepts np.inf, np.nan, pd.NA in X. - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. ..
versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + ensure_2d : bool, default=True Whether to raise a value error if X is not 2D. @@ -1292,6 +1349,8 @@ f"{estimator_name} requires y to be passed, but the target y is None" ) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + X = check_array( X, accept_sparse=accept_sparse, @@ -1300,7 +1359,7 @@ order=order, copy=copy, force_writeable=force_writeable, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ensure_2d=ensure_2d, allow_nd=allow_nd, ensure_min_samples=ensure_min_samples, @@ -1322,7 +1381,7 @@ def _check_y(y, multi_output=False, y_numeric=False, estimator=None): y = check_array( y, accept_sparse="csr", - force_all_finite=True, + ensure_all_finite=True, ensure_2d=False, dtype=None, input_name="y", @@ -1377,7 +1436,7 @@ def column_or_1d(y, *, dtype=None, warn=False): ensure_2d=False, dtype=dtype, input_name="y", - force_all_finite=False, + ensure_all_finite=False, ensure_min_samples=0, )
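Taken together, these changes route every renamed entry point through the same `_deprecate_force_all_finite` shim. A minimal sketch of the resulting behavior, assuming a build of this branch (the array and variable names below are illustrative, not part of the diff):

```python
import warnings

import numpy as np

from sklearn.metrics import pairwise_distances
from sklearn.utils import check_array

X = np.array([[1.0, np.nan], [3.0, 4.0]])

# New spelling: validates exactly as before, no warning is emitted.
check_array(X, ensure_all_finite="allow-nan")

# Old spelling: still honored until 1.8, but now emits a FutureWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    check_array(X, force_all_finite="allow-nan")
    pairwise_distances(X, metric="nan_euclidean", force_all_finite="allow-nan")
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Passing both spellings at once is ambiguous and rejected by the shim.
try:
    check_array(X, force_all_finite=True, ensure_all_finite=True)
except ValueError as exc:
    print(exc)  # 'force_all_finite' and 'ensure_all_finite' cannot be used together...
```

Because the shim maps the new `None` default of `ensure_all_finite` to `True`, callers that pass neither argument keep the current strict finiteness checking, and no warning is emitted on the default path.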