From 35bf0c8a33dcbe86a463517603e9cff5f60f1e01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 3 Jul 2024 16:08:52 +0200 Subject: [PATCH 01/12] rename force_all_finite into ensure_all_finite --- sklearn/cluster/_hdbscan/hdbscan.py | 4 +- sklearn/compose/_column_transformer.py | 4 +- sklearn/compose/_target.py | 2 +- sklearn/covariance/_empirical_covariance.py | 2 +- sklearn/ensemble/_bagging.py | 10 +- sklearn/ensemble/_forest.py | 8 +- sklearn/ensemble/_gb.py | 2 +- .../_hist_gradient_boosting/binning.py | 4 +- .../gradient_boosting.py | 2 +- sklearn/feature_selection/_base.py | 2 +- sklearn/feature_selection/_rfe.py | 4 +- sklearn/feature_selection/_sequential.py | 2 +- .../feature_selection/_variance_threshold.py | 2 +- sklearn/impute/_base.py | 12 +- sklearn/impute/_iterative.py | 8 +- sklearn/impute/_knn.py | 14 +- sklearn/inspection/_partial_dependence.py | 2 +- sklearn/inspection/_permutation_importance.py | 2 +- .../inspection/_plot/partial_dependence.py | 2 +- sklearn/linear_model/_omp.py | 2 +- sklearn/linear_model/_ransac.py | 6 +- sklearn/metrics/pairwise.py | 102 ++++++++++-- sklearn/multiclass.py | 14 +- sklearn/naive_bayes.py | 4 +- sklearn/preprocessing/_data.py | 40 +++-- sklearn/preprocessing/_encoders.py | 28 ++-- sklearn/preprocessing/_target_encoder.py | 6 +- sklearn/semi_supervised/_self_training.py | 12 +- sklearn/tree/_classes.py | 8 +- sklearn/utils/multiclass.py | 4 +- sklearn/utils/tests/test_validation.py | 51 +++--- sklearn/utils/validation.py | 154 ++++++++++++++---- 32 files changed, 341 insertions(+), 178 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d20e745309fca..c02a6fc2338c3 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -734,7 +734,7 @@ def fit(self, X, y=None): X = self._validate_data( X, accept_sparse=["csr", "lil"], - force_all_finite=False, + ensure_all_finite=False, dtype=np.float64, ) self._raw_data = X @@ -779,7 +779,7 @@ def fit(self, X, y=None): # Perform data validation after removing infinite values (numpy.inf) # from the given distance matrix. X = self._validate_data( - X, force_all_finite=False, dtype=np.float64, force_writeable=True + X, ensure_all_finite=False, dtype=np.float64, force_writeable=True ) if np.isnan(X).any(): # TODO: Support np.nan in Cython implementation for precomputed diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a5aa7db17d4ae..a6ce849cd1463 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1107,7 +1107,7 @@ def _hstack(self, Xs, *, n_samples): # in a sparse matrix, `check_array` is used for the # dtype conversion if necessary. converted_Xs = [ - check_array(X, accept_sparse=True, force_all_finite=False) + check_array(X, accept_sparse=True, ensure_all_finite=False) for X in Xs ] except ValueError as e: @@ -1295,7 +1295,7 @@ def _check_X(X): """Use check_array only when necessary, e.g. 
on lists and other non-array-likes.""" if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X): return X - return check_array(X, force_all_finite="allow-nan", dtype=object) + return check_array(X, ensure_all_finite="allow-nan", dtype=object) def _is_empty_column_selection(column): diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index ac33957b23ce2..6f5bf73d591d8 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -263,7 +263,7 @@ def fit(self, X, y, **fit_params): y, input_name="y", accept_sparse=False, - force_all_finite=True, + ensure_all_finite=True, ensure_2d=False, dtype="numeric", allow_nd=True, diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 1e194857c64a2..bd2364ab472bb 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -90,7 +90,7 @@ def empirical_covariance(X, *, assume_centered=False): [0.25, 0.25, 0.25], [0.25, 0.25, 0.25]]) """ - X = check_array(X, ensure_2d=False, force_all_finite=False) + X = check_array(X, ensure_2d=False, ensure_all_finite=False) if X.ndim == 1: X = np.reshape(X, (1, -1)) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 81d3163556626..24bbda275c12d 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -391,7 +391,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): y, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) @@ -941,7 +941,7 @@ def predict_proba(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -991,7 +991,7 @@ def predict_log_proba(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -1046,7 +1046,7 @@ def decision_function(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) @@ -1279,7 +1279,7 @@ def predict(self, X): X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d55a0c645e929..cd09c7571a33e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -362,7 +362,7 @@ def fit(self, X, y, sample_weight=None): multi_output=True, accept_sparse="csc", dtype=DTYPE, - force_all_finite=False, + ensure_all_finite=False, ) # _compute_missing_values_in_feature_mask checks if X has missing values and # will raise an error if the underlying tree base estimator can't handle missing @@ -630,16 +630,16 @@ def _validate_X_predict(self, X): Validate X whenever one tries to predict, apply, predict_proba.""" check_is_fitted(self) if self.estimators_[0]._support_missing_values(X): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=DTYPE, accept_sparse="csr", reset=False, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based sparse matrices") diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c280ca695bcfd..63792f2c44975 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -773,7 
+773,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): dtype=DTYPE, order="C", accept_sparse="csr", - force_all_finite=False, + ensure_all_finite=False, ) raw_predictions = self._raw_predict(X_train) self._resize_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 07a0651f8edc4..9db2ebe9c4592 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -193,7 +193,7 @@ def fit(self, X, y=None): ) ) - X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) max_bins = self.n_bins - 1 rng = check_random_state(self.random_state) @@ -276,7 +276,7 @@ def transform(self, X): X_binned : array-like of shape (n_samples, n_features) The binned data (fortran-aligned). """ - X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) check_is_fitted(self) if X.shape[1] != self.n_bins_non_missing_.shape[0]: raise ValueError( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 78f8456e969de..aa3f13d483602 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -262,7 +262,7 @@ def _preprocess_X(self, X, *, reset): """ # If there is a preprocessor, we let the preprocessor handle the validation. # Otherwise, we validate the data ourselves. - check_X_kwargs = dict(dtype=[X_DTYPE], force_all_finite=False) + check_X_kwargs = dict(dtype=[X_DTYPE], ensure_all_finite=False) if not reset: if self._preprocessor is None: return self._validate_data(X, reset=False, **check_X_kwargs) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 12553403f4b90..7504f7345a264 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -103,7 +103,7 @@ def transform(self, X): X, dtype=None, accept_sparse="csr", - force_all_finite=not _safe_tags(self, key="allow_nan"), + ensure_all_finite=not _safe_tags(self, key="allow_nan"), cast_to_ndarray=not preserve_X, reset=False, ) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index be71efaf27a0e..73c5342f13863 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -274,7 +274,7 @@ def _fit(self, X, y, step_score=None, **fit_params): y, accept_sparse="csc", ensure_min_features=2, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) @@ -725,7 +725,7 @@ def fit(self, X, y, groups=None): y, accept_sparse="csr", ensure_min_features=2, - force_all_finite=False, + ensure_all_finite=False, multi_output=True, ) diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 9c393724f9cea..f76301fe644d7 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -211,7 +211,7 @@ def fit(self, X, y=None): X, accept_sparse="csc", ensure_min_features=2, - force_all_finite=not tags.get("allow_nan", True), + ensure_all_finite=not tags.get("allow_nan", True), ) n_features = X.shape[1] diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index f97c75db1e34b..a67c2fdedb86d 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ 
b/sklearn/feature_selection/_variance_threshold.py @@ -100,7 +100,7 @@ def fit(self, X, y=None): X, accept_sparse=("csr", "csc"), dtype=np.float64, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if hasattr(X, "toarray"): # sparse matrix diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 6109e3fde7b2a..5674bdc5ba85a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -323,9 +323,9 @@ def _validate_input(self, X, in_fit): dtype = self._fit_dtype if is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True try: X = self._validate_data( @@ -334,7 +334,7 @@ def _validate_input(self, X, in_fit): accept_sparse="csc", dtype=dtype, force_writeable=True if not in_fit else None, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, ) except ValueError as ve: @@ -893,15 +893,15 @@ def _get_missing_features_info(self, X): def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, reset=in_fit, accept_sparse=("csc", "csr"), dtype=None, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 41f903061c34d..4f7eb1a0fbb07 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -611,16 +611,16 @@ def _initial_imputation(self, X, in_fit=False): number of features. 
""" if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=FLOAT_DTYPES, order="F", reset=in_fit, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) _check_inputs_dtype(X, self.missing_values) @@ -677,7 +677,7 @@ def _validate_limit(limit, limit_type, n_features): limit = limit_bound if limit is None else limit if np.isscalar(limit): limit = np.full(n_features, limit) - limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False) + limit = check_array(limit, ensure_all_finite=False, copy=False, ensure_2d=False) if not limit.shape[0] == n_features: raise ValueError( f"'{limit_type}_value' should be of " diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 2e18246b4b9bb..f22e1de79cd85 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -225,15 +225,15 @@ def fit(self, X, y=None): """ # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, ) @@ -262,15 +262,15 @@ def transform(self, X): check_is_fitted(self) if not is_scalar_nan(self.missing_values): - force_all_finite = True + ensure_all_finite = True else: - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" X = self._validate_data( X, accept_sparse=False, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, copy=self.copy, reset=False, ) @@ -363,7 +363,7 @@ def process_chunk(dist_chunk, start): self._fit_X, metric=self.metric, missing_values=self.missing_values, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, reduce_func=process_chunk, ) for chunk in gen: diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index b74258615a447..913976b2544e3 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -566,7 +566,7 @@ def partial_dependence( # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not (hasattr(X, "__array__") or sparse.issparse(X)): - X = check_array(X, force_all_finite="allow-nan", dtype=object) + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) if is_regressor(estimator) and response_method != "auto": raise ValueError( diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 659db143153cc..fd128ba3ebbac 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -263,7 +263,7 @@ def permutation_importance( array([0.2211..., 0. , 0. 
]) """ if not hasattr(X, "iloc"): - X = check_array(X, force_all_finite="allow-nan", dtype=None) + X = check_array(X, ensure_all_finite="allow-nan", dtype=None) # Precompute random seed from the random state to be used # to get a fresh independent RandomState instance for each diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 3d516d727192e..007191f89d506 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -540,7 +540,7 @@ def from_estimator( # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not (hasattr(X, "__array__") or sparse.issparse(X)): - X = check_array(X, force_all_finite="allow-nan", dtype=object) + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) n_features = X.shape[1] feature_names = _check_feature_names(X, feature_names) diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index eb4e52be2656c..cf051228d210a 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -1056,7 +1056,7 @@ def fit(self, X, y, **fit_params): _raise_for_params(fit_params, self, "fit") X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2) - X = as_float_array(X, copy=False, force_all_finite=False) + X = as_float_array(X, copy=False, ensure_all_finite=False) cv = check_cv(self.cv, classifier=False) if _routing_enabled(): routed_params = process_routing(self, "fit", **fit_params) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 631a2c1c66815..6943894002c7c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -363,7 +363,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. _raise_for_params(fit_params, self, "fit") - check_X_params = dict(accept_sparse="csr", force_all_finite=False) + check_X_params = dict(accept_sparse="csr", ensure_all_finite=False) check_y_params = dict(ensure_2d=False) X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) @@ -630,7 +630,7 @@ def predict(self, X, **params): check_is_fitted(self) X = self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, accept_sparse=True, reset=False, ) @@ -678,7 +678,7 @@ def score(self, X, y, **params): check_is_fitted(self) X = self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, accept_sparse=True, reset=False, ) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9382d585a5fe7..42c2fb96db7ad 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -77,7 +77,8 @@ def check_pairwise_arrays( precomputed=False, dtype="infer_float", accept_sparse="csr", - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=True, ensure_2d=True, copy=False, ): @@ -127,11 +128,33 @@ def check_pairwise_arrays( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.22 + .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan`. + Accepts `pd.NA` and converts it into `np.nan` + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. 
+ + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.20 + Accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. ensure_2d : bool, default=True Whether to raise an error when the input arrays are not 2-dimensional. Setting @@ -155,6 +178,14 @@ def check_pairwise_arrays( An array equal to Y if Y was not None, guaranteed to be a numpy array. If Y was None, safe_Y will be a pointer to X. """ + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8", + FutureWarning, + ) + ensure_all_finite = force_all_finite + xp, _ = get_namespace(X, Y) if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): X, Y, dtype_float = _return_float_dtype(X, Y) @@ -171,7 +202,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -181,7 +212,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -190,7 +221,7 @@ def check_pairwise_arrays( accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, estimator=estimator, ensure_2d=ensure_2d, ) @@ -499,9 +530,9 @@ def nan_euclidean_distances( [1.41421356]]) """ - force_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True + ensure_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True X, Y = check_pairwise_arrays( - X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy + X, Y, accept_sparse=False, ensure_all_finite=ensure_all_finite, copy=copy ) # Get missing mask for X missing_X = _get_mask(X, missing_values) @@ -1916,13 +1947,13 @@ def _parallel_pairwise(X, Y, func, n_jobs, **kwds): return ret -def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): +def _pairwise_callable(X, Y, metric, ensure_all_finite=True, **kwds): """Handle the callable case for pairwise_{distances,kernels}.""" X, Y = check_pairwise_arrays( X, Y, dtype=None, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ensure_2d=False, ) @@ -2197,7 +2228,12 @@ def pairwise_distances_chunked( "Y": ["array-like", "sparse matrix", None], "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], "n_jobs": [Integral, None], - "force_all_finite": ["boolean", StrOptions({"allow-nan"})], + "force_all_finite": [ + "boolean", + StrOptions({"allow-nan"}), + Hidden(StrOptions({"deprecated"})), + ], + "ensure_all_finite": ["boolean", StrOptions({"allow-nan"})], }, prefer_skip_nested_validation=True, ) @@ -2207,7 +2243,8 @@ def pairwise_distances( metric="euclidean", *, n_jobs=None, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=True, **kwds, ): """Compute the distance matrix from a vector array X and optional Y. 
@@ -2292,8 +2329,7 @@ def pairwise_distances( and quickly degrade performance. force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored - for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. @@ -2305,7 +2341,29 @@ def pairwise_distances( ``force_all_finite`` accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan`. + Accepts `pd.NA` and converts it into `np.nan` + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.22 + Accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. @@ -2338,9 +2396,17 @@ def pairwise_distances( array([[1., 2.], [2., 1.]]) """ + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8", + FutureWarning, + ) + ensure_all_finite = force_all_finite + if metric == "precomputed": X, _ = check_pairwise_arrays( - X, Y, precomputed=True, force_all_finite=force_all_finite + X, Y, precomputed=True, ensure_all_finite=ensure_all_finite ) whom = ( @@ -2355,7 +2421,7 @@ def pairwise_distances( func = partial( _pairwise_callable, metric=metric, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, **kwds, ) else: @@ -2369,7 +2435,7 @@ def pairwise_distances( warnings.warn(msg, DataConversionWarning) X, Y = check_pairwise_arrays( - X, Y, dtype=dtype, force_all_finite=force_all_finite + X, Y, dtype=dtype, ensure_all_finite=ensure_all_finite ) # precompute data-derived metric params diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0c6252975a7f8..bacbbfd461995 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -124,7 +124,7 @@ class _ConstantPredictor(BaseEstimator): def fit(self, X, y): check_params = dict( - force_all_finite=False, dtype=None, ensure_2d=False, accept_sparse=True + ensure_all_finite=False, dtype=None, ensure_2d=False, accept_sparse=True ) self._validate_data( X, y, reset=True, validate_separately=(check_params, check_params) @@ -136,7 +136,7 @@ def predict(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -149,7 +149,7 @@ def decision_function(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -162,7 +162,7 @@ def predict_proba(self, X): check_is_fitted(self) self._validate_data( X, - force_all_finite=False, + ensure_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, @@ -786,7 +786,7 @@ def fit(self, X, y, **fit_params): # We need to validate the data because we do a 
safe_indexing later. X, y = self._validate_data( - X, y, accept_sparse=["csr", "csc"], force_all_finite=False + X, y, accept_sparse=["csr", "csc"], ensure_all_finite=False ) check_classification_targets(y) @@ -889,7 +889,7 @@ def partial_fit(self, X, y, classes=None, **partial_fit_params): X, y, accept_sparse=["csr", "csc"], - force_all_finite=False, + ensure_all_finite=False, reset=first_call, ) check_classification_targets(y) @@ -962,7 +962,7 @@ def decision_function(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 7992a911c1be1..a3af861fda23f 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1420,14 +1420,14 @@ def _more_tags(self): def _check_X(self, X): """Validate X, used only in predict* methods.""" X = self._validate_data( - X, dtype="int", accept_sparse=False, force_all_finite=True, reset=False + X, dtype="int", accept_sparse=False, ensure_all_finite=True, reset=False ) check_non_negative(X, "CategoricalNB (input X)") return X def _check_X_y(self, X, y, reset=True): X, y = self._validate_data( - X, y, dtype="int", accept_sparse=False, force_all_finite=True, reset=reset + X, y, dtype="int", accept_sparse=False, ensure_all_finite=True, reset=reset ) check_non_negative(X, "CategoricalNB (input X)") return X, y diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 68a887f7e2042..62042c6ba831d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -218,7 +218,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): ensure_2d=False, estimator="the scale function", dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): if with_mean: @@ -485,7 +485,7 @@ def partial_fit(self, X, y=None): X, reset=first_pass, dtype=_array_api.supported_float_dtypes(xp), - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) data_min = _array_api._nanmin(X, axis=0, xp=xp) @@ -530,7 +530,7 @@ def transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=False, ) @@ -562,7 +562,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) X -= self.min_ @@ -668,7 +668,11 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): # Unlike the scaler object, this function allows 1d input. # If copy is required, it will be done inside the scaler object. 
X = check_array( - X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -909,7 +913,7 @@ def partial_fit(self, X, y=None, sample_weight=None): X, accept_sparse=("csr", "csc"), dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=first_call, ) n_features = X.shape[1] @@ -1043,7 +1047,7 @@ def transform(self, X, copy=None): copy=copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1085,7 +1089,7 @@ def inverse_transform(self, X, copy=None): copy=copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1247,7 +1251,7 @@ def partial_fit(self, X, y=None): reset=first_pass, accept_sparse=("csr", "csc"), dtype=_array_api.supported_float_dtypes(xp), - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1290,7 +1294,7 @@ def transform(self, X): reset=False, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1322,7 +1326,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=_array_api.supported_float_dtypes(xp), force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1417,7 +1421,7 @@ def maxabs_scale(X, *, axis=0, copy=True): copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -1592,7 +1596,7 @@ def fit(self, X, y=None): X, accept_sparse="csc", dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) q_min, q_max = self.quantile_range @@ -1656,7 +1660,7 @@ def transform(self, X): dtype=FLOAT_DTYPES, force_writeable=True, reset=False, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1689,7 +1693,7 @@ def inverse_transform(self, X): copy=self.copy, dtype=FLOAT_DTYPES, force_writeable=True, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) if sparse.issparse(X): @@ -1822,7 +1826,7 @@ def robust_scale( copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) original_ndim = X.ndim @@ -2866,7 +2870,7 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): # only set force_writeable for the validation at transform time because # it's the only place where QuantileTransformer performs inplace operations. force_writeable=True if not in_fit else None, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. 
@@ -3506,7 +3510,7 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False): dtype=FLOAT_DTYPES, force_writeable=True, copy=self.copy, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", reset=in_fit, ) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e3924c11fb635..50b3cd9498639 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -27,7 +27,7 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ - def _check_X(self, X, force_all_finite=True): + def _check_X(self, X, ensure_all_finite=True): """ Perform custom check_array: - convert list of strings to object dtype @@ -41,16 +41,16 @@ def _check_X(self, X, force_all_finite=True): """ if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): # if not a dataframe, do normal check_array validation - X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite) + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=object, force_all_finite=force_all_finite) + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) else: X = X_temp needs_validation = False else: # pandas dataframe, do validation later column by column, in order # to keep the dtype information to be used in the encoder. - needs_validation = force_all_finite + needs_validation = ensure_all_finite n_samples, n_features = X.shape X_columns = [] @@ -58,7 +58,7 @@ def _check_X(self, X, force_all_finite=True): for i in range(n_features): Xi = _safe_indexing(X, indices=i, axis=1) Xi = check_array( - Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation ) X_columns.append(Xi) @@ -68,7 +68,7 @@ def _fit( self, X, handle_unknown="error", - force_all_finite=True, + ensure_all_finite=True, return_counts=False, return_and_ignore_missing_for_infrequent=False, ): @@ -76,7 +76,7 @@ def _fit( self._check_n_features(X, reset=True) self._check_feature_names(X, reset=True) X_list, n_samples, n_features = self._check_X( - X, force_all_finite=force_all_finite + X, ensure_all_finite=ensure_all_finite ) self.n_features_in_ = n_features @@ -186,12 +186,12 @@ def _transform( self, X, handle_unknown="error", - force_all_finite=True, + ensure_all_finite=True, warn_on_unknown=False, ignore_category_indices=None, ): X_list, n_samples, n_features = self._check_X( - X, force_all_finite=force_all_finite + X, ensure_all_finite=ensure_all_finite ) self._check_feature_names(X, reset=False) self._check_n_features(X, reset=False) @@ -975,7 +975,7 @@ def fit(self, X, y=None): self._fit( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ) self._set_drop_idx() self._n_features_outs = self._compute_n_features_outs() @@ -1023,7 +1023,7 @@ def transform(self, X): X_int, X_mask = self._transform( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", warn_on_unknown=warn_on_unknown, ) @@ -1495,7 +1495,7 @@ def fit(self, X, y=None): fit_results = self._fit( X, handle_unknown=self.handle_unknown, - force_all_finite="allow-nan", + ensure_all_finite="allow-nan", return_and_ignore_missing_for_infrequent=True, ) self._missing_indices = fit_results["missing_indices"] @@ -1577,7 +1577,7 @@ def transform(self, X): X_int, X_mask = self._transform( X, handle_unknown=self.handle_unknown, - 
force_all_finite="allow-nan", + ensure_all_finite="allow-nan", ignore_category_indices=self._missing_indices, ) X_trans = X_int.astype(self.dtype, copy=False) @@ -1606,7 +1606,7 @@ def inverse_transform(self, X): Inverse transformed array. """ check_is_fitted(self) - X = check_array(X, force_all_finite="allow-nan") + X = check_array(X, ensure_all_finite="allow-nan") n_samples, _ = X.shape n_features = len(self.categories_) diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index b3b7c3d5e7bd9..048e80261c276 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -322,7 +322,7 @@ def transform(self, X): Transformed input. """ X_ordinal, X_known_mask = self._transform( - X, handle_unknown="ignore", force_all_finite="allow-nan" + X, handle_unknown="ignore", ensure_all_finite="allow-nan" ) # If 'multiclass' multiply axis=1 by num of classes else keep shape the same @@ -353,7 +353,7 @@ def _fit_encodings_all(self, X, y): ) check_consistent_length(X, y) - self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan") + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") if self.target_type == "auto": accepted_target_types = ("binary", "multiclass", "continuous") @@ -383,7 +383,7 @@ def _fit_encodings_all(self, X, y): self.target_mean_ = np.mean(y, axis=0) X_ordinal, X_known_mask = self._transform( - X, handle_unknown="ignore", force_all_finite="allow-nan" + X, handle_unknown="ignore", ensure_all_finite="allow-nan" ) n_categories = np.fromiter( (len(category_for_feature) for category_for_feature in self.categories_), diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 647f48204414a..83405da99c48e 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -208,7 +208,7 @@ def fit(self, X, y): # we need row slicing support for sparse matrices, but costly finiteness check # can be delegated to the base estimator. 
X, y = self._validate_data( - X, y, accept_sparse=["csr", "csc", "lil", "dok"], force_all_finite=False + X, y, accept_sparse=["csr", "csc", "lil", "dok"], ensure_all_finite=False ) self.base_estimator_ = clone(self.base_estimator) @@ -315,7 +315,7 @@ def predict(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.base_estimator_.predict(X) @@ -338,7 +338,7 @@ def predict_proba(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.base_estimator_.predict_proba(X) @@ -361,7 +361,7 @@ def decision_function(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.base_estimator_.decision_function(X) @@ -384,7 +384,7 @@ def predict_log_proba(self, X): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.base_estimator_.predict_log_proba(X) @@ -410,7 +410,7 @@ def score(self, X, y): X = self._validate_data( X, accept_sparse=True, - force_all_finite=False, + ensure_all_finite=False, reset=False, ) return self.base_estimator_.score(X, y) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 675b58f48a2f4..b7a99e0c4946a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -238,7 +238,7 @@ def _fit( # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( - dtype=DTYPE, accept_sparse="csc", force_all_finite=False + dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) X, y = self._validate_data( @@ -475,15 +475,15 @@ def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: if self._support_missing_values(X): - force_all_finite = "allow-nan" + ensure_all_finite = "allow-nan" else: - force_all_finite = True + ensure_all_finite = True X = self._validate_data( X, dtype=DTYPE, accept_sparse="csr", reset=False, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ) if issparse(X) and ( X.indices.dtype != np.intc or X.indptr.dtype != np.intc diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index ead7f416368e3..307e4269747b7 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -157,7 +157,7 @@ def is_multilabel(y): check_y_kwargs = dict( accept_sparse=True, allow_nd=True, - force_all_finite=False, + ensure_all_finite=False, ensure_2d=False, ensure_min_samples=0, ensure_min_features=0, @@ -320,7 +320,7 @@ def type_of_target(y, input_name=""): check_y_kwargs = dict( accept_sparse=True, allow_nd=True, - force_all_finite=False, + ensure_all_finite=False, ensure_2d=False, ensure_min_samples=0, ensure_min_features=0, diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5bde51ae514d9..77dbafb072989 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -152,7 +152,7 @@ def test_as_float_array(): def test_as_float_array_nan(X): X[5, 0] = np.nan X[6, 1] = np.nan - X_converted = as_float_array(X, force_all_finite="allow-nan") + X_converted = as_float_array(X, ensure_all_finite="allow-nan") assert_allclose_dense_sparse(X_converted, X) @@ -200,18 +200,19 @@ def test_ordering(): 
@pytest.mark.parametrize( - "value, force_all_finite", [(np.inf, False), (np.nan, "allow-nan"), (np.nan, False)] + "value, ensure_all_finite", + [(np.inf, False), (np.nan, "allow-nan"), (np.nan, False)], ) @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) -def test_check_array_force_all_finite_valid(value, force_all_finite, retype): +def test_check_array_ensure_all_finite_valid(value, ensure_all_finite, retype): X = retype(np.arange(4).reshape(2, 2).astype(float)) X[0, 0] = value - X_checked = check_array(X, force_all_finite=force_all_finite, accept_sparse=True) + X_checked = check_array(X, ensure_all_finite=ensure_all_finite, accept_sparse=True) assert_allclose_dense_sparse(X, X_checked) @pytest.mark.parametrize( - "value, input_name, force_all_finite, match_msg", + "value, input_name, ensure_all_finite, match_msg", [ (np.inf, "", True, "Input contains infinity"), (np.inf, "X", True, "Input X contains infinity"), @@ -224,14 +225,14 @@ def test_check_array_force_all_finite_valid(value, force_all_finite, retype): np.nan, "", "allow-inf", - 'force_all_finite should be a bool or "allow-nan"', + "ensure_all_finite should be a bool or 'allow-nan'", ), (np.nan, "", 1, "Input contains NaN"), ], ) @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) -def test_check_array_force_all_finiteinvalid( - value, input_name, force_all_finite, match_msg, retype +def test_check_array_ensure_all_finite_invalid( + value, input_name, ensure_all_finite, match_msg, retype ): X = retype(np.arange(4).reshape(2, 2).astype(np.float64)) X[0, 0] = value @@ -239,7 +240,7 @@ def test_check_array_force_all_finiteinvalid( check_array( X, input_name=input_name, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, accept_sparse=True, ) @@ -286,17 +287,17 @@ def test_check_array_links_to_imputer_doc_only_for_X(input_name, retype): assert extended_msg in ctx.value.args[0] -def test_check_array_force_all_finite_object(): +def test_check_array_ensure_all_finite_object(): X = np.array([["a", "b", np.nan]], dtype=object).T - X_checked = check_array(X, dtype=None, force_all_finite="allow-nan") + X_checked = check_array(X, dtype=None, ensure_all_finite="allow-nan") assert X is X_checked - X_checked = check_array(X, dtype=None, force_all_finite=False) + X_checked = check_array(X, dtype=None, ensure_all_finite=False) assert X is X_checked with pytest.raises(ValueError, match="Input contains NaN"): - check_array(X, dtype=None, force_all_finite=True) + check_array(X, dtype=None, ensure_all_finite=True) @pytest.mark.parametrize( @@ -317,14 +318,14 @@ def test_check_array_force_all_finite_object(): (np.array([[1, np.nan]], dtype=object), "cannot convert float NaN to integer"), ], ) -@pytest.mark.parametrize("force_all_finite", [True, False]) -def test_check_array_force_all_finite_object_unsafe_casting( - X, err_msg, force_all_finite +@pytest.mark.parametrize("ensure_all_finite", [True, False]) +def test_check_array_ensure_all_finite_object_unsafe_casting( + X, err_msg, ensure_all_finite ): # casting a float array containing NaN or inf to int dtype should - # raise an error irrespective of the force_all_finite parameter. + # raise an error irrespective of the ensure_all_finite parameter. 
with pytest.raises(ValueError, match=err_msg): - check_array(X, dtype=int, force_all_finite=force_all_finite) + check_array(X, dtype=int, ensure_all_finite=ensure_all_finite) def test_check_array_series_err_msg(): @@ -509,17 +510,17 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): X = pd.DataFrame(X_np, dtype=pd_dtype, columns=["a", "b", "c"]) # column c has no nans X["c"] = X["c"].astype("float") - X_checked = check_array(X, force_all_finite="allow-nan", dtype=dtype) + X_checked = check_array(X, ensure_all_finite="allow-nan", dtype=dtype) assert_allclose(X_checked, X_np) assert X_checked.dtype == expected_dtype - X_checked = check_array(X, force_all_finite=False, dtype=dtype) + X_checked = check_array(X, ensure_all_finite=False, dtype=dtype) assert_allclose(X_checked, X_np) assert X_checked.dtype == expected_dtype msg = "Input contains NaN" with pytest.raises(ValueError, match=msg): - check_array(X, force_all_finite=True) + check_array(X, ensure_all_finite=True) def test_check_array_panadas_na_support_series(): @@ -530,14 +531,14 @@ def test_check_array_panadas_na_support_series(): msg = "Input contains NaN" with pytest.raises(ValueError, match=msg): - check_array(X_int64, force_all_finite=True, ensure_2d=False) + check_array(X_int64, ensure_all_finite=True, ensure_2d=False) - X_out = check_array(X_int64, force_all_finite=False, ensure_2d=False) + X_out = check_array(X_int64, ensure_all_finite=False, ensure_2d=False) assert_allclose(X_out, [1, 2, np.nan]) assert X_out.dtype == np.float64 X_out = check_array( - X_int64, force_all_finite=False, ensure_2d=False, dtype=np.float32 + X_int64, ensure_all_finite=False, ensure_2d=False, dtype=np.float32 ) assert_allclose(X_out, [1, 2, np.nan]) assert X_out.dtype == np.float32 @@ -1995,7 +1996,7 @@ def test_pandas_array_returns_ndarray(input_values): dtype=None, ensure_2d=False, allow_nd=False, - force_all_finite=False, + ensure_all_finite=False, ) assert np.issubdtype(result.dtype.kind, np.floating) assert_allclose(result, input_values) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 228fbe76a25e1..e6c2cd58c2451 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -212,7 +212,9 @@ def assert_all_finite( ) -def as_float_array(X, *, copy=True, force_all_finite=True): +def as_float_array( + X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=True +): """Convert an array-like to an array of floats. The new dtype will be np.float32 or np.float64, depending on the original @@ -229,13 +231,13 @@ def as_float_array(X, *, copy=True, force_all_finite=True): returned if X's dtype is not a floating point type. force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in X. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - - True: Force all values of X to be finite. - - False: accepts np.inf, np.nan, pd.NA in X. - - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot - be infinite. + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. @@ -243,6 +245,28 @@ def as_float_array(X, *, copy=True, force_all_finite=True): .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. 
deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.20 + Accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + Returns ------- XT : {ndarray, sparse matrix} @@ -256,6 +280,14 @@ def as_float_array(X, *, copy=True, force_all_finite=True): >>> as_float_array(array) array([0., 0., 1., 2., 2.]) """ + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8", + FutureWarning, + ) + ensure_all_finite = force_all_finite + if isinstance(X, np.matrix) or ( not isinstance(X, np.ndarray) and not sp.issparse(X) ): @@ -264,7 +296,7 @@ def as_float_array(X, *, copy=True, force_all_finite=True): accept_sparse=["csr", "csc", "coo"], dtype=np.float64, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, ensure_2d=False, ) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: @@ -514,7 +546,7 @@ def _ensure_sparse_format( accept_sparse, dtype, copy, - force_all_finite, + ensure_all_finite, accept_large_sparse, estimator_name=None, input_name="", @@ -542,7 +574,7 @@ def _ensure_sparse_format( Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. - force_all_finite : bool or 'allow-nan' + ensure_all_finite : bool or 'allow-nan' Whether to raise an error on np.inf, np.nan, pd.NA in X. The possibilities are: @@ -552,7 +584,7 @@ def _ensure_sparse_format( be infinite. .. versionadded:: 0.20 - ``force_all_finite`` accepts the string ``'allow-nan'``. + ``ensure_all_finite`` accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` @@ -615,7 +647,7 @@ def _ensure_sparse_format( # force copy sparse_container = sparse_container.copy() - if force_all_finite: + if ensure_all_finite: if not hasattr(sparse_container, "data"): warnings.warn( f"Can't check {sparse_container.format} sparse matrix for nan or inf.", @@ -624,7 +656,7 @@ def _ensure_sparse_format( else: _assert_all_finite( sparse_container.data, - allow_nan=force_all_finite == "allow-nan", + allow_nan=ensure_all_finite == "allow-nan", estimator_name=estimator_name, input_name=input_name, ) @@ -718,7 +750,8 @@ def check_array( order=None, copy=False, force_writeable=False, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, @@ -790,6 +823,28 @@ def check_array( .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. 
versionadded:: 0.20 + Accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + ensure_2d : bool, default=True Whether to raise a value error if array is not 2D. @@ -831,6 +886,14 @@ def check_array( >>> X_checked array([[1, 2, 3], [4, 5, 6]]) """ + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8", + FutureWarning, + ) + ensure_all_finite = force_all_finite + if isinstance(array, np.matrix): raise TypeError( "np.matrix is not supported. Please convert to a numpy array with " @@ -924,11 +987,10 @@ def is_sparse(dtype): # Since we converted here, we do not need to convert again later dtype = None - if force_all_finite not in (True, False, "allow-nan"): + if ensure_all_finite not in (True, False, "allow-nan"): raise ValueError( - 'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format( - force_all_finite - ) + "ensure_all_finite should be a bool or 'allow-nan'. Got " + f"{ensure_all_finite!r} instead." ) if dtype is not None and _is_numpy_namespace(xp): @@ -967,7 +1029,7 @@ def is_sparse(dtype): accept_sparse=accept_sparse, dtype=dtype, copy=copy, - force_all_finite=force_all_finite, + ensure_all_finite=ensure_all_finite, accept_large_sparse=accept_large_sparse, estimator_name=estimator_name, input_name=input_name, @@ -1054,12 +1116,12 @@ def is_sparse(dtype): % (array.ndim, estimator_name) ) - if force_all_finite: + if ensure_all_finite: _assert_all_finite( array, input_name=input_name, estimator_name=estimator_name, - allow_nan=force_all_finite == "allow-nan", + allow_nan=ensure_all_finite == "allow-nan", ) if copy: @@ -1155,7 +1217,8 @@ def check_X_y( order=None, copy=False, force_writeable=False, - force_all_finite=True, + force_all_finite="deprecated", + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -1217,14 +1280,13 @@ def check_X_y( .. versionadded:: 1.6 force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter - does not influence whether y can have np.inf, np.nan, pd.NA values. - The possibilities are: + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: - - True: Force all values of X to be finite. - - False: accepts np.inf, np.nan, pd.NA in X. - - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot - be infinite. + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. @@ -1232,6 +1294,28 @@ def check_X_y( .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.20 + Accepts the string ``'allow-nan'``. + + .. 
versionchanged:: 0.23
+       Accepts `pd.NA` and converts it into `np.nan`
+
+    .. versionadded:: 1.6
+       `force_all_finite` was renamed to `ensure_all_finite`.
+
     ensure_2d : bool, default=True
         Whether to raise a value error if X is not 2D.
 
@@ -1292,6 +1376,14 @@ def check_X_y(
             f"{estimator_name} requires y to be passed, but the target y is None"
         )
 
+    if force_all_finite != "deprecated":
+        warnings.warn(
+            "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be "
+            "removed in 1.8",
+            FutureWarning,
+        )
+        ensure_all_finite = force_all_finite
+
     X = check_array(
         X,
         accept_sparse=accept_sparse,
@@ -1300,7 +1392,7 @@ def check_X_y(
         order=order,
         copy=copy,
         force_writeable=force_writeable,
-        force_all_finite=force_all_finite,
+        ensure_all_finite=ensure_all_finite,
         ensure_2d=ensure_2d,
         allow_nd=allow_nd,
         ensure_min_samples=ensure_min_samples,
@@ -1322,7 +1414,7 @@ def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
         y = check_array(
             y,
             accept_sparse="csr",
-            force_all_finite=True,
+            ensure_all_finite=True,
             ensure_2d=False,
             dtype=None,
             input_name="y",
@@ -1377,7 +1469,7 @@ def column_or_1d(y, *, dtype=None, warn=False):
         ensure_2d=False,
         dtype=dtype,
         input_name="y",
-        force_all_finite=False,
+        ensure_all_finite=False,
         ensure_min_samples=0,
     )
 

From 4432e7068e3197438460d449c0c1da40cdd73e2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Wed, 3 Jul 2024 16:23:17 +0200
Subject: [PATCH 02/12] changelog

---
 doc/whats_new/v1.6.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index c201d20cbb16a..b7e31df6fe1f9 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -170,6 +170,11 @@ Changelog
   :pr:`29210` by :user:`Marc Torrellas Socastro ` and
   :user:`Stefanie Senger `.
 
+- |API| the `force_all_finite` parameter of functions
+  :func:`metrics.pairwise.check_pairwise_arrays` and :func:`metrics.pairwise_distances`
+  is renamed into `ensure_all_finite`. `force_all_finite` will be removed in 1.8.
+  :pr:`29404` by :user:`Jérémie du Boisberranger `.
+
 :mod:`sklearn.model_selection`
 ..............................
 
@@ -185,6 +190,14 @@ Changelog
   when duplicate values in the training data lead to inaccurate outlier detection.
   :pr:`28773` by :user:`Henrique Caroço `.
 
+:mod:`sklearn.utils`
+....................
+
+- |API| the `force_all_finite` parameter of functions :func:`utils.check_array`,
+  :func:`utils.check_X_y`, :func:`utils.as_float_array` is renamed into
+  `ensure_all_finite`. `force_all_finite` will be removed in 1.8.
+  :pr:`29404` by :user:`Jérémie du Boisberranger `.
+ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: From 7c0a41cc2ed5550485991e106b2898b3aa8c9146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 3 Jul 2024 17:51:12 +0200 Subject: [PATCH 03/12] filter warnings in lightgbm compare test --- .../tests/test_compare_lightgbm.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index bbdcb38ef013a..24b5b02aa0696 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -12,6 +12,10 @@ from sklearn.model_selection import train_test_split +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize( "loss", @@ -118,6 +122,10 @@ def test_same_predictions_regression( assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-4)) > 1 - 0.01 +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("min_samples_leaf", (1, 20)) @pytest.mark.parametrize( @@ -191,6 +199,10 @@ def test_same_predictions_classification( np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("min_samples_leaf", (1, 20)) @pytest.mark.parametrize( From f06e91419313c7716a41dcd052a4513688c5d084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 10 Jul 2024 14:01:51 +0200 Subject: [PATCH 04/12] add tests for deprecation messages --- sklearn/metrics/tests/test_pairwise.py | 14 ++++++++++++++ sklearn/utils/tests/test_validation.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 03d22e0f6d344..9375379428321 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1666,3 +1666,17 @@ def test_sparse_manhattan_readonly_dataset(csr_container): Parallel(n_jobs=2, max_nbytes=0)( delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) ) + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + Y = np.random.uniform(size=(10, 10)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_pairwise_arrays(X, Y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + pairwise_distances(X, Y, force_all_finite=True) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 77dbafb072989..5b7e890adbb4b 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -2190,3 +2190,17 @@ def test_check_array_writeable_df(): # df is backed by a read-only array, a copy is made assert not np.may_share_memory(out, df) assert 
out.flags.writeable + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + y = np.random.randint(1, size=(10,)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_array(X, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + check_array(X, y, force_all_finite=True) From ce2074c8f03990632fda6c9a607342008366004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 10 Jul 2024 14:43:45 +0200 Subject: [PATCH 05/12] fix test --- sklearn/utils/tests/test_validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5b7e890adbb4b..0177c60fe63e8 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -2203,4 +2203,7 @@ def test_force_all_finite_rename_warning(): check_array(X, force_all_finite=True) with pytest.warns(FutureWarning, match=msg): - check_array(X, y, force_all_finite=True) + check_X_y(X, y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + as_float_array(X, force_all_finite=True) From 84987d9b9afcafdd138263760b92f73d87e9d520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 10 Jul 2024 14:51:03 +0200 Subject: [PATCH 06/12] more explicit warning --- sklearn/metrics/pairwise.py | 6 ++++-- sklearn/utils/validation.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 42c2fb96db7ad..52930cca84cb5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -181,7 +181,8 @@ def check_pairwise_arrays( if force_all_finite != "deprecated": warnings.warn( "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8", + "removed in 1.8. Until then, ensure_all_finite is ignored when " + "force_all_finite is set.", FutureWarning, ) ensure_all_finite = force_all_finite @@ -2399,7 +2400,8 @@ def pairwise_distances( if force_all_finite != "deprecated": warnings.warn( "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8", + "removed in 1.8. Until then, ensure_all_finite is ignored when " + "force_all_finite is set.", FutureWarning, ) ensure_all_finite = force_all_finite diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index c87391510bfdf..3e290fc8fa5f9 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -283,7 +283,8 @@ def as_float_array( if force_all_finite != "deprecated": warnings.warn( "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8", + "removed in 1.8. Until then, ensure_all_finite is ignored when " + "force_all_finite is set.", FutureWarning, ) ensure_all_finite = force_all_finite @@ -889,7 +890,8 @@ def check_array( if force_all_finite != "deprecated": warnings.warn( "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8", + "removed in 1.8. Until then, ensure_all_finite is ignored when " + "force_all_finite is set.", FutureWarning, ) ensure_all_finite = force_all_finite @@ -1379,7 +1381,8 @@ def check_X_y( if force_all_finite != "deprecated": warnings.warn( "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8", + "removed in 1.8. 
Until then, ensure_all_finite is ignored when " + "force_all_finite is set.", FutureWarning, ) ensure_all_finite = force_all_finite From f7fdcfd8970dc683f2516846eacccad6c365a12f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Wed, 10 Jul 2024 15:09:18 +0200 Subject: [PATCH 07/12] update latest occurences --- sklearn/ensemble/_iforest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index b8df148f20a1a..f4694d69e2dbb 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -316,7 +316,7 @@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. """ X = self._validate_data( - X, accept_sparse=["csc"], dtype=tree_dtype, force_all_finite=False + X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False ) if issparse(X): # Pre-sort indices to avoid that each individual tree of the @@ -522,7 +522,7 @@ def score_samples(self, X): accept_sparse="csr", dtype=tree_dtype, reset=False, - force_all_finite=False, + ensure_all_finite=False, ) return self._score_samples(X) From e0fdbf6ab7b04b7a04315fd08bd13b780443af9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 18 Jul 2024 18:51:46 +0200 Subject: [PATCH 08/12] cln bad copy paste --- sklearn/metrics/pairwise.py | 8 +++---- sklearn/utils/validation.py | 46 +++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 52930cca84cb5..dbd783d2409dd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -128,11 +128,11 @@ def check_pairwise_arrays( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.20 + .. versionadded:: 0.22 ``force_all_finite`` accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` + Accepts `pd.NA` and converts it into `np.nan`. .. deprecated:: 1.6 `force_all_finite` was renamed to `ensure_all_finite` and will be removed @@ -147,7 +147,7 @@ def check_pairwise_arrays( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.20 + .. versionadded:: 0.22 Accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 @@ -2342,7 +2342,7 @@ def pairwise_distances( ``force_all_finite`` accepts the string ``'allow-nan'``. .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` + Accepts `pd.NA` and converts it into `np.nan`. .. deprecated:: 1.6 `force_all_finite` was renamed to `ensure_all_finite` and will be removed diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3e290fc8fa5f9..ccdd409b190c8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -231,13 +231,13 @@ def as_float_array( returned if X's dtype is not a floating point type. force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in X. The possibilities are: - - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. Values - cannot be infinite. + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. 
Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. @@ -250,13 +250,13 @@ def as_float_array( in 1.8. ensure_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in X. The possibilities are: - - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. Values - cannot be infinite. + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 Accepts the string ``'allow-nan'``. @@ -1282,13 +1282,14 @@ def check_X_y( .. versionadded:: 1.6 force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The - possibilities are: + Whether to raise an error on np.inf, np.nan, pd.NA in array. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. + The possibilities are: - - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. Values - cannot be infinite. + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. @@ -1301,13 +1302,14 @@ def check_X_y( in 1.8. ensure_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The - possibilities are: + Whether to raise an error on np.inf, np.nan, pd.NA in array. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. + The possibilities are: - - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. Values - cannot be infinite. + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 Accepts the string ``'allow-nan'``. From f3a67569151a6057833b6c7ea0bae4b7d5dd96bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 18 Jul 2024 18:56:21 +0200 Subject: [PATCH 09/12] iter --- sklearn/metrics/pairwise.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index dbd783d2409dd..2f61c101cebe0 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -2330,7 +2330,8 @@ def pairwise_distances( and quickly degrade performance. force_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The possibilities are: - True: Force all values of array to be finite. @@ -2349,7 +2350,8 @@ def pairwise_distances( in 1.8. ensure_all_finite : bool or 'allow-nan', default=True - Whether to raise an error on np.inf, np.nan, pd.NA in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. 
Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The possibilities are: - True: Force all values of array to be finite. From c9dba755643290ecd79c4f1259b3862140007734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 25 Jul 2024 15:16:46 +0200 Subject: [PATCH 10/12] raise when both are set --- sklearn/metrics/pairwise.py | 37 +++++-------------------- sklearn/utils/deprecation.py | 24 +++++++++++++++++ sklearn/utils/validation.py | 52 +++++------------------------------- 3 files changed, 37 insertions(+), 76 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 08a7dc9eb9d65..7e5bbe2e52ed4 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -42,6 +42,7 @@ StrOptions, validate_params, ) +from ..utils.deprecation import _deprecate_force_all_finite from ..utils.extmath import row_norms, safe_sparse_dot from ..utils.fixes import parse_version, sp_base_version from ..utils.parallel import Parallel, delayed @@ -83,7 +84,7 @@ def check_pairwise_arrays( dtype="infer_float", accept_sparse="csr", force_all_finite="deprecated", - ensure_all_finite=True, + ensure_all_finite=None, ensure_2d=True, copy=False, ): @@ -152,12 +153,6 @@ def check_pairwise_arrays( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.22 - Accepts the string ``'allow-nan'``. - - .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` - .. versionadded:: 1.6 `force_all_finite` was renamed to `ensure_all_finite`. @@ -183,14 +178,7 @@ def check_pairwise_arrays( An array equal to Y if Y was not None, guaranteed to be a numpy array. If Y was None, safe_Y will be a pointer to X. """ - if force_all_finite != "deprecated": - warnings.warn( - "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8. Until then, ensure_all_finite is ignored when " - "force_all_finite is set.", - FutureWarning, - ) - ensure_all_finite = force_all_finite + ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) xp, _ = get_namespace(X, Y) if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): @@ -2258,7 +2246,7 @@ def pairwise_distances_chunked( StrOptions({"allow-nan"}), Hidden(StrOptions({"deprecated"})), ], - "ensure_all_finite": ["boolean", StrOptions({"allow-nan"})], + "ensure_all_finite": ["boolean", StrOptions({"allow-nan"}), Hidden(None)], }, prefer_skip_nested_validation=True, ) @@ -2269,7 +2257,7 @@ def pairwise_distances( *, n_jobs=None, force_all_finite="deprecated", - ensure_all_finite=True, + ensure_all_finite=None, **kwds, ): """Compute the distance matrix from a vector array X and optional Y. @@ -2383,12 +2371,6 @@ def pairwise_distances( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.22 - Accepts the string ``'allow-nan'``. - - .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` - .. versionadded:: 1.6 `force_all_finite` was renamed to `ensure_all_finite`. @@ -2423,14 +2405,7 @@ def pairwise_distances( array([[1., 2.], [2., 1.]]) """ - if force_all_finite != "deprecated": - warnings.warn( - "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8. 
Until then, ensure_all_finite is ignored when " - "force_all_finite is set.", - FutureWarning, - ) - ensure_all_finite = force_all_finite + ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) if metric == "precomputed": X, _ = check_pairwise_arrays( diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index b3eca8053b94a..b46826fd81a64 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -136,3 +136,27 @@ def _deprecate_Xt_in_inverse_transform(X, Xt): return Xt return X + + +# TODO(1.8): remove force_all_finite and change the default value of ensure_all_finite +# to True (remove None without deprecation). +def _deprecate_force_all_finite(force_all_finite, ensure_all_finite): + """Helper to deprecate force_all_finite in favor of ensure_all_finite.""" + if force_all_finite != "deprecated": + warnings.warn( + "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " + "removed in 1.8.", + FutureWarning, + ) + + if ensure_all_finite is not None: + raise ValueError( + "'force_all_finite' and 'ensure_all_finite' cannot be used together." + ) + + return force_all_finite + + if ensure_all_finite is None: + return True + + return ensure_all_finite diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index ccdd409b190c8..5cef505cb936e 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -18,6 +18,7 @@ from .. import get_config as _get_config from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace +from ..utils.deprecation import _deprecate_force_all_finite from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype from ._isfinite import FiniteStatus, cy_isfinite from .fixes import _object_dtype_isnan @@ -213,7 +214,7 @@ def assert_all_finite( def as_float_array( - X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=True + X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=None ): """Convert an array-like to an array of floats. @@ -258,12 +259,6 @@ def as_float_array( - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot be infinite. - .. versionadded:: 0.20 - Accepts the string ``'allow-nan'``. - - .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` - .. versionadded:: 1.6 `force_all_finite` was renamed to `ensure_all_finite`. @@ -280,14 +275,7 @@ def as_float_array( >>> as_float_array(array) array([0., 0., 1., 2., 2.]) """ - if force_all_finite != "deprecated": - warnings.warn( - "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8. Until then, ensure_all_finite is ignored when " - "force_all_finite is set.", - FutureWarning, - ) - ensure_all_finite = force_all_finite + ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) if isinstance(X, np.matrix) or ( not isinstance(X, np.ndarray) and not sp.issparse(X) @@ -752,7 +740,7 @@ def check_array( copy=False, force_writeable=False, force_all_finite="deprecated", - ensure_all_finite=True, + ensure_all_finite=None, ensure_2d=True, allow_nd=False, ensure_min_samples=1, @@ -837,12 +825,6 @@ def check_array( - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. - .. versionadded:: 0.20 - Accepts the string ``'allow-nan'``. - - .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` - .. 
versionadded:: 1.6 `force_all_finite` was renamed to `ensure_all_finite`. @@ -887,14 +869,7 @@ def check_array( >>> X_checked array([[1, 2, 3], [4, 5, 6]]) """ - if force_all_finite != "deprecated": - warnings.warn( - "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8. Until then, ensure_all_finite is ignored when " - "force_all_finite is set.", - FutureWarning, - ) - ensure_all_finite = force_all_finite + ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) if isinstance(array, np.matrix): raise TypeError( @@ -1220,7 +1195,7 @@ def check_X_y( copy=False, force_writeable=False, force_all_finite="deprecated", - ensure_all_finite=True, + ensure_all_finite=None, ensure_2d=True, allow_nd=False, multi_output=False, @@ -1311,12 +1286,6 @@ def check_X_y( - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot be infinite. - .. versionadded:: 0.20 - Accepts the string ``'allow-nan'``. - - .. versionchanged:: 0.23 - Accepts `pd.NA` and converts it into `np.nan` - .. versionadded:: 1.6 `force_all_finite` was renamed to `ensure_all_finite`. @@ -1380,14 +1349,7 @@ def check_X_y( f"{estimator_name} requires y to be passed, but the target y is None" ) - if force_all_finite != "deprecated": - warnings.warn( - "'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be " - "removed in 1.8. Until then, ensure_all_finite is ignored when " - "force_all_finite is set.", - FutureWarning, - ) - ensure_all_finite = force_all_finite + ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) X = check_array( X, From ab1d4855ec58863680946dd16e5961323da3b578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 25 Jul 2024 15:28:06 +0200 Subject: [PATCH 11/12] iter --- sklearn/metrics/pairwise.py | 4 ++-- sklearn/utils/validation.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 7e5bbe2e52ed4..745767f23e818 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -178,7 +178,7 @@ def check_pairwise_arrays( An array equal to Y if Y was not None, guaranteed to be a numpy array. If Y was None, safe_Y will be a pointer to X. 
""" - ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) xp, _ = get_namespace(X, Y) if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): @@ -2405,7 +2405,7 @@ def pairwise_distances( array([[1., 2.], [2., 1.]]) """ - ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) if metric == "precomputed": X, _ = check_pairwise_arrays( diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5cef505cb936e..4bbd922d28e37 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -275,7 +275,7 @@ def as_float_array( >>> as_float_array(array) array([0., 0., 1., 2., 2.]) """ - ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) if isinstance(X, np.matrix) or ( not isinstance(X, np.ndarray) and not sp.issparse(X) @@ -869,7 +869,7 @@ def check_array( >>> X_checked array([[1, 2, 3], [4, 5, 6]]) """ - ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) if isinstance(array, np.matrix): raise TypeError( @@ -1349,7 +1349,7 @@ def check_X_y( f"{estimator_name} requires y to be passed, but the target y is None" ) - ensure_all_finite = _deprecate_force_all_finite(ensure_all_finite, force_all_finite) + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) X = check_array( X, From 36d968e0b5db3a8164bdacc11b17268a5cd8a42b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 25 Jul 2024 16:32:07 +0200 Subject: [PATCH 12/12] Update sklearn/utils/deprecation.py Co-authored-by: Adrin Jalali --- sklearn/utils/deprecation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index b46826fd81a64..ff08ec2aceb81 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -151,7 +151,8 @@ def _deprecate_force_all_finite(force_all_finite, ensure_all_finite): if ensure_all_finite is not None: raise ValueError( - "'force_all_finite' and 'ensure_all_finite' cannot be used together." + "'force_all_finite' and 'ensure_all_finite' cannot be used together. " + "Pass `ensure_all_finite` only." ) return force_all_finite