Skip to content

ENH add auto inference based on pd.CategoricalDtype in SMOTENC #1009

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/over_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ which categorical data are treated differently::
In this data set, the first and last features are considered as categorical
features. One needs to provide this information to :class:`SMOTENC` via the
parameters ``categorical_features`` either by passing the indices, the feature
names when `X` is a pandas DataFrame, or a boolean mask marking these features::
names when `X` is a pandas DataFrame, a boolean mask marking these features,
or relying on `dtype` inference if the columns are using the
:class:`pandas.CategoricalDtype`::

>>> from imblearn.over_sampling import SMOTENC
>>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
Expand Down
7 changes: 7 additions & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,10 @@ Enhancements
- :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str`
when passing the `categorical_features` parameter.
:pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`.

- :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference
when `categorical_features` is set to `"auto"`.
:pr:`1009` by :user:`Guillaume Lemaitre <glemaitre>`.
45 changes: 34 additions & 11 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@
from ...metrics.pairwise import ValueDifferenceMetric
from ...utils import Substitution, check_neighbors_object, check_target_type
from ...utils._docstring import _n_jobs_docstring, _random_state_docstring
from ...utils._param_validation import HasMethods, Interval
from ...utils._param_validation import HasMethods, Interval, StrOptions
from ...utils._validation import _check_X
from ...utils.fixes import _mode
from ...utils.fixes import _is_pandas_df, _mode
from ..base import BaseOverSampler


Expand Down Expand Up @@ -395,10 +395,13 @@ class SMOTENC(SMOTE):

Parameters
----------
categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
dtype={{bool, int, str}}
categorical_features : "auto" or array-like of shape (n_cat_features,) or \
(n_features,), dtype={{bool, int, str}}
Specifies which features are categorical. Can either be:

- "auto" (default) to automatically detect categorical features. Only
supported when `X` is a :class:`pandas.DataFrame` and it corresponds
to columns that have a :class:`pandas.CategoricalDtype`;
- array of `int` corresponding to the indices specifying the categorical
features;
- array of `str` corresponding to the feature names. `X` should be a pandas
Expand Down Expand Up @@ -538,7 +541,7 @@ class SMOTENC(SMOTE):

_parameter_constraints: dict = {
**SMOTE._parameter_constraints,
"categorical_features": ["array-like"],
"categorical_features": ["array-like", StrOptions({"auto"})],
"categorical_encoder": [
HasMethods(["fit_transform", "inverse_transform"]),
None,
Expand Down Expand Up @@ -575,12 +578,27 @@ def _check_X_y(self, X, y):
return X, y, binarize_y

def _validate_column_types(self, X):
self.categorical_features_ = np.array(
_get_column_indices(X, self.categorical_features)
)
self.continuous_features_ = np.setdiff1d(
np.arange(self.n_features_), self.categorical_features_
)
"""Compute the indices of the categorical and continuous features."""
if self.categorical_features == "auto":
if not _is_pandas_df(X):
raise ValueError(
"When `categorical_features='auto'`, the input data "
f"should be a pandas.DataFrame. Got {type(X)} instead."
)
import pandas as pd # safely import pandas now

are_columns_categorical = np.array(
[isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes]
)
self.categorical_features_ = np.flatnonzero(are_columns_categorical)
self.continuous_features_ = np.flatnonzero(~are_columns_categorical)
else:
self.categorical_features_ = np.array(
_get_column_indices(X, self.categorical_features)
)
self.continuous_features_ = np.setdiff1d(
np.arange(self.n_features_), self.categorical_features_
)

def _validate_estimator(self):
super()._validate_estimator()
Expand All @@ -589,6 +607,11 @@ def _validate_estimator(self):
"SMOTE-NC is not designed to work only with categorical "
"features. It requires some numerical features."
)
elif self.categorical_features_.size == 0:
raise ValueError(
"SMOTE-NC is not designed to work only with numerical "
"features. It requires some categorical features."
)

def _fit_resample(self, X, y):
# FIXME: to be removed in 0.12
Expand Down
49 changes: 49 additions & 0 deletions imblearn/over_sampling/_smote/tests/test_smote_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,3 +349,52 @@ def test_smotenc_categorical_features_str():
assert counter[0] == counter[1] == 70
assert_array_equal(smote.categorical_features_, [1, 2])
assert_array_equal(smote.continuous_features_, [0])


def test_smotenc_categorical_features_auto():
    """Check that `categorical_features="auto"` detects the categorical columns
    of a pandas dataframe from their dtype.
    """
    pd = pytest.importorskip("pandas")

    base_frame = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "B": ["a", "b"] * 5,
            "C": ["a", "b", "c"] * 3 + ["a"],
        }
    )
    # Repeat the 10 base rows to obtain 100 samples matching the target below.
    X = pd.concat([base_frame] * 10, ignore_index=True)
    # Only "B" and "C" are cast to the categorical dtype; "A" stays numerical.
    for column in ("B", "C"):
        X[column] = X[column].astype("category")
    y = np.array([0] * 70 + [1] * 30)

    sampler = SMOTENC(categorical_features="auto", random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    # The resampled categorical columns only contain the original categories.
    expected_categories = (("B", ["a", "b"]), ("C", ["a", "b", "c"]))
    for column, categories in expected_categories:
        assert X_res[column].isin(categories).all()

    # Both classes end up with the size of the majority class.
    class_counts = Counter(y_res)
    assert class_counts[0] == class_counts[1] == 70

    assert_array_equal(sampler.categorical_features_, [1, 2])
    assert_array_equal(sampler.continuous_features_, [0])


def test_smote_nc_categorical_features_auto_error():
    """Check that we raise a proper error when we cannot use the `'auto'` mode.

    Two failure modes are exercised:

    - `X` is not a pandas dataframe, so no dtype-based inference is possible;
    - `X` is a dataframe in which no column uses a categorical dtype.
    """
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "B": ["a", "b"] * 5,
            "C": ["a", "b", "c"] * 3 + ["a"],
        }
    )
    # Repeat the rows so that `X` has as many samples as `y` (100). The expected
    # errors must come from the `"auto"` handling and must not depend on whether
    # a sample-count consistency check happens to run first.
    X = pd.concat([X] * 10, ignore_index=True)
    y = np.array([0] * 70 + [1] * 30)
    smote = SMOTENC(categorical_features="auto", random_state=0)

    # A NumPy array is not a dataframe, so `"auto"` is rejected outright.
    with pytest.raises(ValueError, match="the input data should be a pandas.DataFrame"):
        smote.fit_resample(X.to_numpy(), y)

    # "B" and "C" hold strings but were never cast to "category", so no column is
    # detected as categorical.
    err_msg = "SMOTE-NC is not designed to work only with numerical features"
    with pytest.raises(ValueError, match=err_msg):
        smote.fit_resample(X, y)