-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
ENH Adds array_out="pandas" to transformers in preprocessing module #20100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -23,6 +23,7 @@ | |||||
from .utils.validation import check_array | ||||||
from .utils.validation import _num_features | ||||||
from .utils._estimator_html_repr import estimator_html_repr | ||||||
from .utils._array_out import _get_feature_names | ||||||
|
||||||
|
||||||
def clone(estimator, *, safe=True): | ||||||
|
@@ -376,6 +377,33 @@ def _check_n_features(self, X, reset): | |||||
f"X has {n_features} features, but {self.__class__.__name__} " | ||||||
f"is expecting {self.n_features_in_} features as input.") | ||||||
|
||||||
def _check_feature_names(self, X, reset=True): | ||||||
"""Set the `feature_names_in_` attribute, or check against it. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
X : array-like | ||||||
The input samples. | ||||||
reset : bool, default=True | ||||||
If True, the `n_feature_names_` attribute is set to the feature | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
names of `X`. | ||||||
Else, the attribute must already exist and the function checks | ||||||
that it is equal to the feature names of `X`. | ||||||
""" | ||||||
feature_names = _get_feature_names(X) | ||||||
if reset: | ||||||
self.feature_names_in_ = feature_names | ||||||
return | ||||||
|
||||||
if (not hasattr(self, 'feature_names_in_') or | ||||||
self.feature_names_in_ is None or | ||||||
feature_names is None): | ||||||
return | ||||||
|
||||||
if any(feature_names != self.feature_names_in_): | ||||||
raise ValueError("The input's feature names does not match the " | ||||||
"feature_names_in_ attribute.") | ||||||
|
||||||
def _validate_data(self, X, y='no_validation', reset=True, | ||||||
validate_separately=False, **check_params): | ||||||
"""Validate input data and set or check the `n_features_in_` attribute. | ||||||
|
@@ -418,6 +446,7 @@ def _validate_data(self, X, y='no_validation', reset=True, | |||||
out : {ndarray, sparse matrix} or tuple of these | ||||||
The validated input. A tuple is returned if `y` is not None. | ||||||
""" | ||||||
self._check_feature_names(X, reset=reset) | ||||||
|
||||||
if y is None: | ||||||
if self._get_tags()['requires_y']: | ||||||
|
@@ -678,7 +707,7 @@ def get_submatrix(self, i, data): | |||||
class TransformerMixin: | ||||||
"""Mixin class for all transformers in scikit-learn.""" | ||||||
|
||||||
def fit_transform(self, X, y=None, **fit_params): | ||||||
def fit_transform(self, X, y=None, array_out="default", **fit_params): | ||||||
""" | ||||||
Fit to data, then transform it. | ||||||
|
||||||
|
@@ -694,6 +723,11 @@ def fit_transform(self, X, y=None, **fit_params): | |||||
default=None | ||||||
Target values (None for unsupervised transformations). | ||||||
|
||||||
array_out : {"default", "pandas"}, default="default" | ||||||
Specify the output array type. If "pandas", a pandas DataFrame is | ||||||
returned. If "default", an array-like without feature names is | ||||||
returned. | ||||||
Comment on lines +728 to +729
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's probably worth explaining why this is not a numpy array, and sometimes a sparse array. |
||||||
|
||||||
**fit_params : dict | ||||||
Additional fit parameters. | ||||||
|
||||||
|
@@ -706,10 +740,20 @@ def fit_transform(self, X, y=None, **fit_params): | |||||
# method is possible for a given clustering algorithm | ||||||
if y is None: | ||||||
# fit method of arity 1 (unsupervised transformation) | ||||||
return self.fit(X, **fit_params).transform(X) | ||||||
fitted = self.fit(X, **fit_params) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that also confused me. I think it's always self, right? Or at least should be. |
||||||
else: | ||||||
# fit method of arity 2 (supervised transformation) | ||||||
return self.fit(X, y, **fit_params).transform(X) | ||||||
fitted = self.fit(X, y, **fit_params) | ||||||
|
||||||
if array_out == "default": | ||||||
return fitted.transform(X) | ||||||
|
||||||
# array_out != "default" | ||||||
transform_params = inspect.signature(fitted.transform).parameters | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we not just pass it, or is the error message not good in that case? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you mean "can we just pass it"? If we don't pass it when it doesn't exist, it's a silent bug, and I think just passing it gives an error message which can be confusing to many people. |
||||||
if "array_out" not in transform_params: | ||||||
raise ValueError("Transform does not support array_out") | ||||||
|
||||||
return fitted.transform(X, array_out=array_out) | ||||||
|
||||||
|
||||||
class DensityMixin: | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is `array_out` a good name? That seems to imply... arrays. `output_format`? Or `output`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd be happy with either `output` or `output_format`.