ENH make some meta-estimators lenient towards missing values #17987

Merged 23 commits on Aug 26, 2020
13 changes: 13 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -361,6 +361,12 @@ Changelog
  validity of the input is now delegated to the base estimator.
  :pr:`17233` by :user:`Zolisa Bleki <zoj613>`.

- |Enhancement| :class:`multiclass.OneVsOneClassifier` now accepts
  inputs with missing values. Hence, estimators that can handle
  missing values (for instance, a pipeline with an imputation step)
  can be used as the estimator for multiclass wrappers.
  :pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
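
  For illustration only (not part of this diff), a minimal sketch of the
  new behavior, assuming this patch is applied::

      import numpy as np
      from sklearn.datasets import load_iris
      from sklearn.impute import SimpleImputer
      from sklearn.linear_model import LogisticRegression
      from sklearn.multiclass import OneVsOneClassifier
      from sklearn.pipeline import make_pipeline

      X, y = load_iris(return_X_y=True)
      X[::10, 0] = np.nan  # introduce missing values

      # the imputer inside the pipeline handles the NaNs, so the OvO
      # wrapper no longer rejects the input upfront
      ovo = OneVsOneClassifier(make_pipeline(SimpleImputer(),
                                             LogisticRegression()))
      ovo.fit(X, y).score(X, y)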

:mod:`sklearn.multioutput`
..........................

@@ -369,6 +375,13 @@ Changelog
  :pr:`18124` by :user:`Gus Brocchini <boldloop>` and
  :user:`Amanda Dsouza <amy12xx>`.

- |Enhancement| :class:`multioutput.MultiOutputClassifier` and
  :class:`multioutput.MultiOutputRegressor` now accept inputs with
  missing values. Hence, estimators that can handle missing values
  (for instance, a pipeline with an imputation step, or the
  HistGradientBoosting estimators) can be used as the estimator for
  multioutput wrappers.
  :pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
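
  For illustration only (not part of this diff), a minimal sketch using an
  estimator with native missing-value support (the experimental import is
  still required at the time of this release)::

      import numpy as np
      from sklearn.experimental import enable_hist_gradient_boosting  # noqa
      from sklearn.ensemble import HistGradientBoostingRegressor
      from sklearn.multioutput import MultiOutputRegressor

      rng = np.random.RandomState(0)
      X, Y = rng.randn(100, 4), rng.randn(100, 2)
      X[rng.rand(*X.shape) < 0.1] = np.nan  # sprinkle missing values

      # HistGradientBoostingRegressor handles NaN natively, so the
      # multioutput wrapper simply passes the data through
      est = MultiOutputRegressor(HistGradientBoostingRegressor())
      est.fit(X, Y).predict(X)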

:mod:`sklearn.naive_bayes`
..........................

34 changes: 33 additions & 1 deletion sklearn/ensemble/tests/test_common.py
@@ -1,3 +1,4 @@
import numpy as np
import pytest

from sklearn.base import clone
@@ -6,14 +7,20 @@

from sklearn.datasets import make_classification
from sklearn.datasets import make_regression

from sklearn.datasets import load_iris, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor

X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
"X, y, estimator",
@@ -170,3 +177,28 @@ def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    estimator.set_params(lr='drop')
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)


@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [(StackingClassifier, LogisticRegression, X, y),
     (StackingRegressor, LinearRegression, X_r, y_r),
     (VotingClassifier, LogisticRegression, X, y),
     (VotingRegressor, LinearRegression, X_r, y_r)]
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble,
                                                       Estimator, X, y):
    # check that the Voting and Stacking predictors delegate the validation
    # of missing values to the underlying estimator.
    X = X.copy()
    mask = np.random.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[('pipe1', pipe), ('pipe2', pipe)])
    ensemble.fit(X, y).score(X, y)
12 changes: 7 additions & 5 deletions sklearn/multiclass.py
Original file line number Diff line number Diff line change
@@ -294,7 +294,7 @@ def partial_fit(self, X, y, classes=None):
        if _check_partial_fit_first_call(self, classes):
            if not hasattr(self.estimator, "partial_fit"):
                raise ValueError(("Base estimator {0}, doesn't have "
                                  "partial_fit method").format(self.estimator))
            self.estimators_ = [clone(self.estimator) for _ in range
                                (self.n_classes_)]

@@ -307,8 +307,8 @@ def partial_fit(self, X, y, classes=None):

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError(("Mini-batch contains {0} while classes " +
                              "must be subset of {1}").format(np.unique(y),
                                                              self.classes_))

        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
@@ -578,7 +578,8 @@ def fit(self, X, y):
        -------
        self
        """
-        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
+        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'],
+                                   force_all_finite=False)
        check_classification_targets(y)

        self.classes_ = np.unique(y)
@@ -635,7 +636,8 @@ def partial_fit(self, X, y, classes=None):
                             "must be subset of {1}".format(np.unique(y),
                                                            self.classes_))

-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
+                         force_all_finite=False)
        check_classification_targets(y)
        combinations = itertools.combinations(range(self.n_classes_), 2)
        self.estimators_ = Parallel(
7 changes: 5 additions & 2 deletions sklearn/multioutput.py
@@ -101,6 +101,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
        self : object
        """
        X, y = check_X_y(X, y,
+                        force_all_finite=False,
                         multi_output=True,
                         accept_sparse=True)

@@ -153,7 +154,9 @@ def fit(self, X, y, sample_weight=None, **fit_params):
            raise ValueError("The base estimator should implement"
                             " a fit method")

-        X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True)
+        X, y = self._validate_data(X, y,
+                                   force_all_finite=False,
+                                   multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)
@@ -196,7 +199,7 @@ def predict(self, X):
            raise ValueError("The base estimator should implement"
                             " a predict method")

-        X = check_array(X, accept_sparse=True)
+        X = check_array(X, force_all_finite=False, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(
            delayed(e.predict)(X)
21 changes: 20 additions & 1 deletion sklearn/tests/test_multiclass.py
@@ -29,7 +29,8 @@
                                  SGDClassifier)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
-from sklearn.pipeline import Pipeline
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn import datasets

@@ -776,3 +777,21 @@ def test_pairwise_cross_val_score():
    score_precomputed = cross_val_score(ovr_true, linear_kernel, y)
    score_linear = cross_val_score(ovr_false, X, y)
    assert_array_equal(score_precomputed, score_linear)


@pytest.mark.parametrize("MultiClassClassifier",
                         [OneVsRestClassifier, OneVsOneClassifier])
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiClassClassifier):
    # smoke test to check that the OvR and OvO classifiers delegate the
    # validation of missing values to the underlying pipeline or classifier
    rng = np.random.RandomState(42)
    X, y = iris.data, iris.target
    mask = rng.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    lr = make_pipeline(SimpleImputer(),
                       LogisticRegression(random_state=rng))

    MultiClassClassifier(lr).fit(X, y).score(X, y)
28 changes: 25 additions & 3 deletions sklearn/tests/test_multioutput.py
@@ -31,6 +31,8 @@
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer


def test_multi_target_regression():
@@ -302,7 +304,7 @@ def test_multiclass_multioutput_estimator():
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert (list(multi_class_svc_.predict(X)) ==
                list(predictions[:, i]))


def test_multiclass_multioutput_estimator_predict_proba():
@@ -463,7 +465,7 @@ def test_classifier_chain_vs_independent_models():
    Y_pred_chain = chain.predict(X_test)

    assert (jaccard_score(Y_test, Y_pred_chain, average='samples') >
            jaccard_score(Y_test, Y_pred_ovr, average='samples'))


@@ -476,7 +478,7 @@ def test_base_chain_fit_and_predict():
    Y_pred = chain.predict(X)
    assert Y_pred.shape == Y.shape
    assert ([c.coef_.size for c in chain.estimators_] ==
            list(range(X.shape[1], X.shape[1] + Y.shape[1])))

    Y_prob = chains[1].predict_proba(X)
    Y_binary = (Y_prob >= .5)
@@ -603,6 +605,26 @@ def fit(self, X, y, **fit_params):
    assert est.sample_weight_ is weight


@pytest.mark.parametrize(
    'MultiOutputEstimator, Estimator',
    [(MultiOutputClassifier, LogisticRegression),
     (MultiOutputRegressor, Ridge)]
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
    # smoke test to check that the multioutput estimators delegate the
    # validation of missing values to the underlying pipeline, regressor,
    # or classifier
    rng = np.random.RandomState(42)
    X, y = rng.randn(50, 2), rng.binomial(1, 0.5, (50, 3))
    mask = rng.choice([1, 0], X.shape, p=[.01, .99]).astype(bool)
    X[mask] = np.nan

    pipe = make_pipeline(SimpleImputer(), Estimator())
    MultiOutputEstimator(pipe).fit(X, y).score(X, y)


@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
11 changes: 11 additions & 0 deletions sklearn/tests/test_pipeline.py
@@ -36,6 +36,7 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.impute import SimpleImputer

iris = load_iris()

@@ -1222,6 +1223,16 @@ def transform(self, X, y=None):
    t.fit_transform(X, y, a=0)


def test_pipeline_missing_values_leniency():
    # check that the pipeline delegates the validation of missing values
    # to the underlying transformers and predictors.
    X, y = iris.data, iris.target
    mask = np.random.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), LogisticRegression())
    assert pipe.fit(X, y).score(X, y) > 0.4


def test_feature_union_warns_unknown_transformer_weight():
    # Warn user when transformer_weights contains a key not present in
    # transformer_list