Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions sklearn/datasets/_arff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def _io_to_generator(gzip_file):

# calculate chunksize
first_row = next(arff_container["data"])
-first_df = pd.DataFrame([first_row], columns=columns_names)
+first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)

row_bytes = first_df.memory_usage(deep=True).sum()
chunksize = get_chunk_n_rows(row_bytes)
Expand All @@ -196,7 +196,9 @@ def _io_to_generator(gzip_file):
columns_to_keep = [col for col in columns_names if col in columns_to_select]
dfs = [first_df[columns_to_keep]]
for data in _chunk_generator(arff_container["data"], chunksize):
-dfs.append(pd.DataFrame(data, columns=columns_names)[columns_to_keep])
+dfs.append(
+pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
+)
frame = pd.concat(dfs, ignore_index=True)
del dfs, first_df

Expand Down
2 changes: 1 addition & 1 deletion sklearn/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _convert_data_dataframe(
):
pd = check_pandas_support("{} with as_frame=True".format(caller_name))
if not sparse_data:
-data_df = pd.DataFrame(data, columns=feature_names)
+data_df = pd.DataFrame(data, columns=feature_names, copy=False)
else:
data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)

Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _wrap_in_pandas_container(
data_to_wrap.columns = columns
return data_to_wrap

-return pd.DataFrame(data_to_wrap, index=index, columns=columns)
+return pd.DataFrame(data_to_wrap, index=index, columns=columns, copy=False)


def _get_output_config(method, estimator=None):
Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ def _convert_container(container, constructor_name, columns_name=None, dtype=Non
return sp.sparse.csr_matrix(container, dtype=dtype)
elif constructor_name == "dataframe":
pd = pytest.importorskip("pandas")
-return pd.DataFrame(container, columns=columns_name, dtype=dtype)
+return pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False)
elif constructor_name == "series":
pd = pytest.importorskip("pandas")
return pd.Series(container, dtype=dtype)
Expand Down
22 changes: 11 additions & 11 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,11 +925,11 @@ def check_sample_weights_pandas_series(name, estimator_orig):
[3, 4],
]
)
-X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X))
+X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False)
y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
weights = pd.Series([1] * 12)
if _safe_tags(estimator, key="multioutput_only"):
-y = pd.DataFrame(y)
+y = pd.DataFrame(y, copy=False)
try:
estimator.fit(X, y, sample_weight=weights)
except ValueError:
Expand Down Expand Up @@ -3218,10 +3218,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):

y_ = np.asarray(y)
if y_.ndim == 1:
-y_ = pd.Series(y_)
+y_ = pd.Series(y_, copy=False)
else:
-y_ = pd.DataFrame(y_)
-X_ = pd.DataFrame(np.asarray(X))
+y_ = pd.DataFrame(y_, copy=False)
+X_ = pd.DataFrame(np.asarray(X), copy=False)

except ImportError:
raise SkipTest(
Expand Down Expand Up @@ -3897,7 +3897,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
n_samples, n_features = X_orig.shape

names = np.array([f"col_{i}" for i in range(n_features)])
-X = pd.DataFrame(X_orig, columns=names)
+X = pd.DataFrame(X_orig, columns=names, copy=False)

if is_regressor(estimator):
y = rng.normal(size=n_samples)
Expand Down Expand Up @@ -3985,7 +3985,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
early_stopping_enabled = any(value is True for value in params.values())

for invalid_name, additional_message in invalid_names:
-X_bad = pd.DataFrame(X, columns=invalid_name)
+X_bad = pd.DataFrame(X, columns=invalid_name, copy=False)

expected_msg = re.escape(
"The feature names should match those that were passed during fit.\n"
Expand Down Expand Up @@ -4094,7 +4094,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig):
y_[::2, 1] *= 2

feature_names_in = [f"col{i}" for i in range(n_features)]
-df = pd.DataFrame(X, columns=feature_names_in)
+df = pd.DataFrame(X, columns=feature_names_in, copy=False)
X_transform = transformer.fit_transform(df, y=y_)

# error is raised when `input_features` do not match feature_names_in
Expand Down Expand Up @@ -4324,7 +4324,7 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas):
# We always rely on the output of `get_feature_names_out` of the
# transformer used to generate the dataframe as a ground-truth of the
# columns.
-expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas)
+expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False)

try:
pd.testing.assert_frame_equal(df_trans, expected_dataframe)
Expand Down Expand Up @@ -4359,7 +4359,7 @@ def check_set_output_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
-df = pd.DataFrame(X, columns=feature_names_in)
+df = pd.DataFrame(X, columns=feature_names_in, copy=False)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand Down Expand Up @@ -4401,7 +4401,7 @@ def check_global_ouptut_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
-df = pd.DataFrame(X, columns=feature_names_in)
+df = pd.DataFrame(X, columns=feature_names_in, copy=False)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand Down