From 33303ea5aa366dabbaf012a4e4ddbe3f612f4d60 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 24 Apr 2023 08:46:52 -0400 Subject: [PATCH] MNT Use copy=False when creating DataFrames --- sklearn/datasets/_arff_parser.py | 6 ++++-- sklearn/datasets/_base.py | 2 +- sklearn/utils/_set_output.py | 2 +- sklearn/utils/_testing.py | 2 +- sklearn/utils/estimator_checks.py | 22 +++++++++++----------- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 2ec61f5817db1..d085e9d85f2a9 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -187,7 +187,7 @@ def _io_to_generator(gzip_file): # calculate chunksize first_row = next(arff_container["data"]) - first_df = pd.DataFrame([first_row], columns=columns_names) + first_df = pd.DataFrame([first_row], columns=columns_names, copy=False) row_bytes = first_df.memory_usage(deep=True).sum() chunksize = get_chunk_n_rows(row_bytes) @@ -196,7 +196,9 @@ def _io_to_generator(gzip_file): columns_to_keep = [col for col in columns_names if col in columns_to_select] dfs = [first_df[columns_to_keep]] for data in _chunk_generator(arff_container["data"], chunksize): - dfs.append(pd.DataFrame(data, columns=columns_names)[columns_to_keep]) + dfs.append( + pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] + ) frame = pd.concat(dfs, ignore_index=True) del dfs, first_df diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index bcd163caf738d..af7e3cdaaf7ff 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -86,7 +86,7 @@ def _convert_data_dataframe( ): pd = check_pandas_support("{} with as_frame=True".format(caller_name)) if not sparse_data: - data_df = pd.DataFrame(data, columns=feature_names) + data_df = pd.DataFrame(data, columns=feature_names, copy=False) else: data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names) diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index ab4f558e1c2e3..8071544091fca 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -57,7 +57,7 @@ def _wrap_in_pandas_container( data_to_wrap.columns = columns return data_to_wrap - return pd.DataFrame(data_to_wrap, index=index, columns=columns) + return pd.DataFrame(data_to_wrap, index=index, columns=columns, copy=False) def _get_output_config(method, estimator=None): diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index efd5aaee40efb..20d0ef09e27a7 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -843,7 +843,7 @@ def _convert_container(container, constructor_name, columns_name=None, dtype=Non return sp.sparse.csr_matrix(container, dtype=dtype) elif constructor_name == "dataframe": pd = pytest.importorskip("pandas") - return pd.DataFrame(container, columns=columns_name, dtype=dtype) + return pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False) elif constructor_name == "series": pd = pytest.importorskip("pandas") return pd.Series(container, dtype=dtype) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4575d29545735..75e47c6c98897 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -925,11 +925,11 @@ def check_sample_weights_pandas_series(name, estimator_orig): [3, 4], ] ) - X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X)) + X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) if _safe_tags(estimator, key="multioutput_only"): - y = pd.DataFrame(y) + y = pd.DataFrame(y, copy=False) try: estimator.fit(X, y, sample_weight=weights) except ValueError: @@ -3218,10 +3218,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type): y_ = np.asarray(y) if y_.ndim == 1: - y_ = pd.Series(y_) + y_ = pd.Series(y_, copy=False) else: - y_ = pd.DataFrame(y_) - X_ = pd.DataFrame(np.asarray(X)) + y_ = pd.DataFrame(y_, copy=False) + X_ = pd.DataFrame(np.asarray(X), copy=False) except ImportError: raise SkipTest( @@ -3897,7 +3897,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): n_samples, n_features = X_orig.shape names = np.array([f"col_{i}" for i in range(n_features)]) - X = pd.DataFrame(X_orig, columns=names) + X = pd.DataFrame(X_orig, columns=names, copy=False) if is_regressor(estimator): y = rng.normal(size=n_samples) @@ -3985,7 +3985,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): early_stopping_enabled = any(value is True for value in params.values()) for invalid_name, additional_message in invalid_names: - X_bad = pd.DataFrame(X, columns=invalid_name) + X_bad = pd.DataFrame(X, columns=invalid_name, copy=False) expected_msg = re.escape( "The feature names should match those that were passed during fit.\n" @@ -4094,7 +4094,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): y_[::2, 1] *= 2 feature_names_in = [f"col{i}" for i in range(n_features)] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) X_transform = transformer.fit_transform(df, y=y_) # error is raised when `input_features` do not match feature_names_in @@ -4324,7 +4324,7 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. - expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas) + expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4359,7 +4359,7 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4401,7 +4401,7 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)