Skip to content

FIX compute the median of std dev for each class to over-sample in SMOTENC #1015

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,9 @@ or relying on `dtype` inference if the columns are using the
>>> print(sorted(Counter(y_resampled).items()))
[(0, 30), (1, 30)]
>>> print(X_resampled[-5:])
[['A' 0.52... 2]
[['A' 0.19... 2]
['B' -0.36... 2]
['B' 0.93... 2]
['B' 0.87... 2]
['B' 0.37... 2]
['B' 0.33... 2]]

Expand Down
9 changes: 9 additions & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ Bug fixes
they are plugged into an Euclidean distance computation.
:pr:`1014` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the median of the
standard deviations of the continuous features was only computed on the minority
class. Now, this statistic is computed for each class that is over-sampled.
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` such that the case where
the median of the standard deviations of the continuous features is zero is also
handled in the multiclass case.
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.

Version 0.11.0
==============
Expand Down
97 changes: 55 additions & 42 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import math
import numbers
import warnings
from collections import Counter

import numpy as np
from scipy import sparse
Expand All @@ -23,7 +22,6 @@
check_random_state,
)
from sklearn.utils.sparsefuncs_fast import (
csc_mean_variance_axis0,
csr_mean_variance_axis0,
)
from sklearn.utils.validation import _num_features
Expand Down Expand Up @@ -116,11 +114,11 @@ def _make_samples(
rows = np.floor_divide(samples_indices, nn_num.shape[1])
cols = np.mod(samples_indices, nn_num.shape[1])

X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps)
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
return X_new, y_new

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
r"""Generate a synthetic sample.

The rule for the generation is:
Expand Down Expand Up @@ -155,6 +153,9 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
steps : ndarray of shape (n_samples,), dtype=float
Step sizes for new samples.

y_type : None
Unused parameter. Only present for compatibility with SMOTE-NC.

Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Expand Down Expand Up @@ -465,8 +466,9 @@ class SMOTENC(SMOTE):
continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64
Indices of the continuous features.

median_std_ : float
Median of the standard deviation of the continuous features.
median_std_ : dict of class label -> float
Median of the standard deviation of the continuous features for each
class to be over-sampled.

n_features_ : int
Number of features observed at `fit`.
Expand Down Expand Up @@ -627,23 +629,8 @@ def _fit_resample(self, X, y):
self._validate_column_types(X)
self._validate_estimator()

# compute the median of the standard deviation of the minority class
target_stats = Counter(y)
class_minority = min(target_stats, key=target_stats.get)

X_continuous = _safe_indexing(X, self.continuous_features_, axis=1)
X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))

if sparse.issparse(X):
if X.format == "csr":
_, var = csr_mean_variance_axis0(X_minority)
else:
_, var = csc_mean_variance_axis0(X_minority)
else:
var = X_minority.var(axis=0)
self.median_std_ = np.median(np.sqrt(var))

X_categorical = _safe_indexing(X, self.categorical_features_, axis=1)
if X_continuous.dtype.name != "object":
dtype_ohe = X_continuous.dtype
Expand All @@ -664,28 +651,54 @@ def _fit_resample(self, X, y):
if not sparse.issparse(X_ohe):
X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe)

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.

# In the edge case where the median of the std is equal to 0, the 1s
# entries will be also nullified. In this case, we store the original
# categorical encoding which will be later used for inverting the OHE
if math.isclose(self.median_std_, 0):
self._X_categorical_minority_encoded = _safe_indexing(
X_ohe.toarray(), np.flatnonzero(y == class_minority)
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr", dtype=dtype_ohe)
X_resampled = [X_encoded.copy()]
y_resampled = [y.copy()]

# SMOTE resampling starts here
self.median_std_ = {}
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = _safe_indexing(X_encoded, target_class_indices)

_, var = csr_mean_variance_axis0(
X_class[:, : self.continuous_features_.size]
)
self.median_std_[class_sample] = np.median(np.sqrt(var))

# In the edge case where the median of the std is equal to 0, the 1s
# entries will be also nullified. In this case, we store the original
# categorical encoding which will be later used for inverting the OHE
if math.isclose(self.median_std_[class_sample], 0):
# This variable will be used when generating data
self._X_categorical_minority_encoded = X_class[
:, self.continuous_features_.size :
].toarray()

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.
X_class_categorical = X_class[:, self.continuous_features_.size :]
# With one-hot encoding, the median will be repeated twice. We need
# to divide by sqrt(2) such that we only have one median value
# contributing to the Euclidean distance
X_class_categorical.data[:] = self.median_std_[class_sample] / np.sqrt(2)
X_class[:, self.continuous_features_.size :] = X_class_categorical

# With one-hot encoding, the median will be repeated twice. We need to divide
# by sqrt(2) such that we only have one median value contributing to the
# Euclidean distance
X_ohe.data = (
np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / np.sqrt(2)
)
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")
self.nn_k_.fit(X_class)
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
X_new, y_new = self._make_samples(
X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
)
X_resampled.append(X_new)
y_resampled.append(y_new)

X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
X_resampled = sparse.vstack(X_resampled, format=X_encoded.format)
y_resampled = np.hstack(y_resampled)
# SMOTE resampling ends here

# reverse the encoding of the categorical features
X_res_cat = X_resampled[:, self.continuous_features_.size :]
Expand Down Expand Up @@ -723,7 +736,7 @@ def _fit_resample(self, X, y):

return X_resampled, y_resampled

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
"""Generate a synthetic sample with an additional step for the
categorical features.

Expand All @@ -741,7 +754,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):

# In the case where the median std is equal to zero, we have to
# create non-null entries based on the stored OHE encoding
if math.isclose(self.median_std_, 0):
if math.isclose(self.median_std_[y_type], 0):
nn_data[
:, self.continuous_features_.size :
] = self._X_categorical_minority_encoded
Expand Down
23 changes: 21 additions & 2 deletions imblearn/over_sampling/_smote/tests/test_smote_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ def test_smotenc(data):
assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx])
assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype

assert isinstance(smote.median_std_, dict)


# part of the common tests which apply to SMOTE-NC even if it is not default
# constructible
Expand Down Expand Up @@ -193,6 +195,7 @@ def test_smotenc_pandas():
X_res, y_res = smote.fit_resample(X, y)
assert_array_equal(X_res_pd.to_numpy(), X_res)
assert_allclose(y_res_pd, y_res)
assert set(smote.median_std_.keys()) == {0, 1}


def test_smotenc_preserve_dtype():
Expand Down Expand Up @@ -234,20 +237,36 @@ def test_smote_nc_with_null_median_std():
[
[1, 2, 1, "A"],
[2, 1, 2, "A"],
[2, 1, 2, "A"],
[1, 2, 3, "B"],
[1, 2, 4, "C"],
[1, 2, 5, "C"],
[1, 2, 4, "C"],
[1, 2, 4, "C"],
[1, 2, 4, "C"],
],
dtype="object",
)
labels = np.array(
["class_1", "class_1", "class_1", "class_2", "class_2"], dtype=object
[
"class_1",
"class_1",
"class_1",
"class_1",
"class_2",
"class_2",
"class_3",
"class_3",
"class_3",
],
dtype=object,
)
smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0)
X_res, y_res = smote.fit_resample(data, labels)
# check that the categorical feature is not random but corresponds to the
# categories seen in the minority class samples
assert X_res[-1, -1] == "C"
assert_array_equal(X_res[-3:, -1], np.array(["C", "C", "C"], dtype=object))
assert smote.median_std_ == {"class_2": 0.0, "class_3": 0.0}


def test_smotenc_categorical_encoder():
Expand Down