diff --git a/doc/conf.py b/doc/conf.py
index 27a6bf2ee30c2..c736adc8e267e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -263,9 +263,9 @@
     'sphx_glr_plot_compare_methods_001.png': 349}


-# enable experimental module so that the new GBDTs estimators can be
+# enable experimental module so that experimental estimators can be
 # discovered properly by sphinx
-from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.experimental import *  # noqa


 def make_carousel_thumbs(app, exception):
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index d61b519a9171e..5f7bb2026021c 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -471,6 +471,7 @@ Samples generator
    :toctree: generated/

    experimental.enable_hist_gradient_boosting
+   experimental.enable_iterative_imputer

 .. _feature_extraction_ref:
diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 777a2bd157b29..4cd0ea6e85d60 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -105,7 +105,16 @@ of ``y``. This is done for each feature in an iterative fashion, and then is
 repeated for ``max_iter`` imputation rounds. The results of the final
 imputation round are returned.

+.. note::
+
+   This estimator is still **experimental** for now: the predictions
+   and the API might change without any deprecation cycle. To use it,
+   you need to explicitly import ``enable_iterative_imputer``.
+
+::
+
     >>> import numpy as np
+    >>> from sklearn.experimental import enable_iterative_imputer
     >>> from sklearn.impute import IterativeImputer
     >>> imp = IterativeImputer(max_iter=10, random_state=0)
     >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])  # doctest: +NORMALIZE_WHITESPACE
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 299ed158797af..73ebaacfd44a0 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -216,7 +216,7 @@ Support for Python 3.4 and below has been officially dropped.
     >>> # explicitly require this experimental feature
     >>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
-    >>> # now you can import normally from ensemble
+    >>> # now you can import normally from sklearn.ensemble
     >>> from sklearn.ensemble import HistGradientBoostingClassifier

   :pr:`12807` by :user:`Nicolas Hug`.
@@ -358,6 +358,15 @@ Support for Python 3.4 and below has been officially dropped.
   :pr:`12177` by :user:`Sergey Feldman ` and
   :user:`Ben Lawson `.

+  The API of IterativeImputer is experimental and subject to change without any
+  deprecation cycle. To use it, you need to explicitly import
+  ``enable_iterative_imputer``::
+
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> # now you can import normally from sklearn.impute
+    >>> from sklearn.impute import IterativeImputer
+
 - |Feature| The :class:`impute.SimpleImputer` and
   :class:`impute.IterativeImputer` have a new parameter ``'add_indicator'``,
   which simply stacks a :class:`impute.MissingIndicator` transform into the
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 382e8de304be8..06fab08c381f2 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -42,6 +42,8 @@
 import matplotlib.pyplot as plt
 import pandas as pd

+# To use this experimental feature, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
 from sklearn.datasets import fetch_california_housing
 from sklearn.impute import SimpleImputer
 from sklearn.impute import IterativeImputer
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 897b66aad246c..2d2d37745abf3 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -23,6 +23,8 @@
 import numpy as np
 import matplotlib.pyplot as plt

+# To use the experimental IterativeImputer, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
 from sklearn.datasets import load_diabetes
 from sklearn.datasets import load_boston
 from sklearn.ensemble import RandomForestRegressor
diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py
new file mode 100644
index 0000000000000..2f262141cc069
--- /dev/null
+++ b/sklearn/experimental/enable_iterative_imputer.py
@@ -0,0 +1,19 @@
+"""Enables IterativeImputer
+
+The API and results of this estimator might change without any deprecation
+cycle.
+
+Importing this file dynamically sets :class:`sklearn.impute.IterativeImputer`
+as an attribute of the impute module::
+
+    >>> # explicitly require this experimental feature
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> # now you can import normally from impute
+    >>> from sklearn.impute import IterativeImputer
+"""
+
+from ..impute._iterative import IterativeImputer
+from .. import impute
+
+impute.IterativeImputer = IterativeImputer
+impute.__all__ += ['IterativeImputer']
diff --git a/sklearn/experimental/tests/test_enable_iterative_imputer.py b/sklearn/experimental/tests/test_enable_iterative_imputer.py
new file mode 100644
index 0000000000000..17579e0c43612
--- /dev/null
+++ b/sklearn/experimental/tests/test_enable_iterative_imputer.py
@@ -0,0 +1,39 @@
+"""Tests for making sure experimental imports work as expected."""
+
+import textwrap
+
+from sklearn.utils.testing import assert_run_python_script
+
+
+def test_imports_strategies():
+    # Make sure different import strategies work or fail as expected.
+
+    # Since Python caches the imported modules, we need to run a child process
+    # for every test case. Else, the tests would not be independent
+    # (manually removing the imports from the cache (sys.modules) is not
+    # recommended and can lead to many complications).
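+    # assert_run_python_script runs each snippet below in its own fresh
+    # Python process, so import state cannot leak between the cases.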
+ + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(bad_imports)) diff --git a/sklearn/impute.py b/sklearn/impute.py deleted file mode 100644 index 8bbf1bb94e242..0000000000000 --- a/sklearn/impute.py +++ /dev/null @@ -1,1339 +0,0 @@ -"""Transformers for missing value imputation""" -# Authors: Nicolas Tresegnie -# Sergey Feldman -# License: BSD 3 clause - -from __future__ import division - -import warnings -import numbers -from time import time -from distutils.version import LooseVersion - -import numpy as np -import numpy.ma as ma -import scipy -from scipy import sparse -from scipy import stats -from collections import namedtuple - -from .base import BaseEstimator, TransformerMixin -from .base import clone -from .exceptions import ConvergenceWarning -from .preprocessing import normalize -from .utils import check_array, check_random_state, safe_indexing -from .utils.sparsefuncs import _get_median -from .utils.validation import check_is_fitted -from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _object_dtype_isnan -from .utils import is_scalar_nan - - -ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', - 'neighbor_feat_idx', - 'estimator']) - -__all__ = [ - 'MissingIndicator', - 'SimpleImputer', - 'IterativeImputer', -] - - -def _check_inputs_dtype(X, missing_values): - if (X.dtype.kind in ("f", "i", "u") and - not isinstance(missing_values, numbers.Real)): - raise ValueError("'X' and 'missing_values' types are expected to be" - " both numerical. Got X.dtype={} and " - " type(missing_values)={}." - .format(X.dtype, type(missing_values))) - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if is_scalar_nan(value_to_mask): - if X.dtype.kind == "f": - return np.isnan(X) - elif X.dtype.kind in ("i", "u"): - # can't have NaNs in integer array. - return np.zeros(X.shape, dtype=bool) - else: - # np.isnan does not work on object dtypes. - return _object_dtype_isnan(X) - else: - # X == value_to_mask with object dytpes does not always perform - # element-wise for old versions of numpy - return np.equal(X, value_to_mask) - - -def _most_frequent(array, extra_value, n_repeat): - """Compute the most frequent value in a 1d array extended with - [extra_value] * n_repeat, where extra_value is assumed to be not part - of the array.""" - # Compute the most frequent value in array only - if array.size > 0: - with warnings.catch_warnings(): - # stats.mode raises a warning when input array contains objects due - # to incapacity to detect NaNs. Irrelevant here since input array - # has already been NaN-masked. 
- warnings.simplefilter("ignore", RuntimeWarning) - mode = stats.mode(array) - - most_frequent_value = mode[0][0] - most_frequent_count = mode[1][0] - else: - most_frequent_value = 0 - most_frequent_count = 0 - - # Compare to array + [extra_value] * n_repeat - if most_frequent_count == 0 and n_repeat == 0: - return np.nan - elif most_frequent_count < n_repeat: - return extra_value - elif most_frequent_count > n_repeat: - return most_frequent_value - elif most_frequent_count == n_repeat: - # Ties the breaks. Copy the behaviour of scipy.stats.mode - if most_frequent_value < extra_value: - return most_frequent_value - else: - return extra_value - - -class SimpleImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. - - strategy : string, optional (default="mean") - The imputation strategy. - - - If "mean", then replace missing values using the mean along - each column. Can only be used with numeric data. - - If "median", then replace missing values using the median along - each column. Can only be used with numeric data. - - If "most_frequent", then replace missing using the most frequent - value along each column. Can be used with strings or numeric data. - - If "constant", then replace missing values with fill_value. Can be - used with strings or numeric data. - - .. versionadded:: 0.20 - strategy="constant" for fixed value imputation. - - fill_value : string or numerical value, optional (default=None) - When strategy == "constant", fill_value is used to replace all - occurrences of missing_values. - If left to the default, fill_value will be 0 when imputing numerical - data and "missing_value" for strings or object data types. - - verbose : integer, optional (default=0) - Controls the verbosity of the imputer. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, in the following cases, - a new copy will always be made, even if `copy=False`: - - - If X is not an array of floating values; - - If X is encoded as a CSR matrix; - - If add_indicator=True. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - statistics_ : array of shape (n_features,) - The imputation fill value for each feature. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... 
# doctest: +NORMALIZE_WHITESPACE - SimpleImputer(add_indicator=False, copy=True, fill_value=None, - missing_values=nan, strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - - Notes - ----- - Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant". - - """ - def __init__(self, missing_values=np.nan, strategy="mean", - fill_value=None, verbose=0, copy=True, add_indicator=False): - self.missing_values = missing_values - self.strategy = strategy - self.fill_value = fill_value - self.verbose = verbose - self.copy = copy - self.add_indicator = add_indicator - - def _validate_input(self, X): - allowed_strategies = ["mean", "median", "most_frequent", "constant"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - if self.strategy in ("most_frequent", "constant"): - dtype = None - else: - dtype = FLOAT_DTYPES - - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - - try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) - except ValueError as ve: - if "could not convert" in str(ve): - raise ValueError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) - else: - raise ve - - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not support data with dtype " - "{0}. Please provide either a numeric array (with" - " a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - return X - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : SimpleImputer - """ - X = self._validate_input(X) - - # default fill_value is 0 for numerical input and "missing_value" - # otherwise - if self.fill_value is None: - if X.dtype.kind in ("i", "u", "f"): - fill_value = 0 - else: - fill_value = "missing_value" - else: - fill_value = self.fill_value - - # fill_value should be numerical in case of numerical input - if (self.strategy == "constant" and - X.dtype.kind in ("i", "u", "f") and - not isinstance(fill_value, numbers.Real)): - raise ValueError("'fill_value'={0} is invalid. Expected a " - "numerical value when imputing numerical " - "data".format(fill_value)) - - if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. 
Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - fill_value) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - self.indicator_.fit(X) - else: - self.indicator_ = None - - return self - - def _sparse_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - - statistics = np.empty(X.shape[1]) - - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) - else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics - - def _dense_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on dense data.""" - mask = _get_mask(X, missing_values) - masked_X = ma.masked_array(X, mask=mask) - - # Mean - if strategy == "mean": - mean_masked = np.ma.mean(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - mean = np.ma.getdata(mean_masked) - mean[np.ma.getmask(mean_masked)] = np.nan - - return mean - - # Median - elif strategy == "median": - median_masked = np.ma.median(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - median = np.ma.getdata(median_masked) - median[np.ma.getmaskarray(median_masked)] = np.nan - - return median - - # Most frequent - elif strategy == "most_frequent": - # scipy.stats.mstats.mode cannot be used because it will no work - # properly if the first element is masked and if its frequency - # is equal to the frequency of the most frequent valid element - # See https://github.com/scipy/scipy/issues/2636 - - # To be able access the elements by columns - X = X.transpose() - mask = mask.transpose() - - if X.dtype.kind == "O": - most_frequent = np.empty(X.shape[0], dtype=object) - else: - most_frequent = np.empty(X.shape[0]) - - for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): - row_mask = np.logical_not(row_mask).astype(np.bool) - row = row[row_mask] - most_frequent[i] = _most_frequent(row, np.nan, 0) - - return most_frequent - - # Constant - elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - return np.full(X.shape[1], fill_value, dtype=X.dtype) - - def transform(self, X): - """Impute all missing values in X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. 
- """ - check_is_fitted(self, 'statistics_') - - X = self._validate_input(X) - - statistics = self.statistics_ - - if X.shape[1] != statistics.shape[0]: - raise ValueError("X has %d features per sample, expected %d" - % (X.shape[1], self.statistics_.shape[0])) - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - # Delete the invalid columns if strategy is not constant - if self.strategy == "constant": - valid_statistics = statistics - else: - # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - - if invalid_mask.any(): - missing = np.arange(X.shape[1])[invalid_mask] - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - - # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] - - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) - else: - mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=0) - values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] - - X[coordinates] = values - - if self.add_indicator: - hstack = sparse.hstack if sparse.issparse(X) else np.hstack - X = hstack((X, X_trans_indicator)) - - return X - - def _more_tags(self): - return {'allow_nan': True} - - -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. - - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object, default=BayesianRidge() - The estimator to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted estimator for each imputation. Estimator must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. - - max_iter : int, optional (default=10) - Maximum number of imputation rounds to perform before returning the - imputations computed during the final round. A round is a single - imputation of each feature with missing values. The stopping criterion - is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only - applied if ``sample_posterior=False``. - - tol : float, optional (default=1e-3) - Tolerance of the stopping condition. - - n_nearest_features : int, optional (default=None) - Number of other features to use to estimate the missing values of - each feature column. 
Nearness between features is measured using - the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. - - initial_strategy : str, optional (default="mean") - Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` - Valid values: {"mean", "median", "most_frequent", or "constant"}. - - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - min_value : float, optional (default=None) - Minimum possible imputed value. Default of ``None`` will set minimum - to negative infinity. - - max_value : float, optional (default=None) - Maximum possible imputed value. Default of ``None`` will set maximum - to positive infinity. - - verbose : int, optional (default=0) - Verbosity flag, controls the debug messages that are issued - as functions are evaluated. The higher, the more verbose. Can be 0, 1, - or 2. - - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of estimator features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary `. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. - - imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where - ``feat_idx`` is the current feature to be imputed, - ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``estimator`` is the trained estimator used for - the imputation. Length is ``self.n_features_with_missing_ * - self.n_iter_``. - - n_iter_ : int - Number of iteration rounds that occurred. Will be less than - ``self.max_iter`` if early stopping criterion was reached. - - n_features_with_missing_ : int - Number of features with missing values. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. - - Notes - ----- - To support imputation in inductive mode we store each feature's estimator - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. 
- - Features which contain all missing values at ``fit`` are discarded upon - ``transform``. - - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. - - References - ---------- - .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: - Multivariate Imputation by Chained Equations in R". Journal of - Statistical Software 45: 1-67. - `_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - `_ - """ - - def __init__(self, - estimator=None, - missing_values=np.nan, - sample_posterior=False, - max_iter=10, - tol=1e-3, - n_nearest_features=None, - initial_strategy="mean", - imputation_order='ascending', - min_value=None, - max_value=None, - verbose=0, - random_state=None, - add_indicator=False): - - self.estimator = estimator - self.missing_values = missing_values - self.sample_posterior = sample_posterior - self.max_iter = max_iter - self.tol = tol - self.n_nearest_features = n_nearest_features - self.initial_strategy = initial_strategy - self.imputation_order = imputation_order - self.min_value = min_value - self.max_value = max_value - self.verbose = verbose - self.random_state = random_state - self.add_indicator = add_indicator - - def _impute_one_feature(self, - X_filled, - mask_missing_values, - feat_idx, - neighbor_feat_idx, - estimator=None, - fit_mode=True): - """Impute a single feature from the others provided. - - This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``estimator`` must - support ``return_std=True`` in its ``predict`` method for this function - to work. - - Parameters - ---------- - X_filled : ndarray - Input data with the most recent imputations. - - mask_missing_values : ndarray - Input data's missing indicator matrix. - - feat_idx : int - Index of the feature currently being imputed. - - neighbor_feat_idx : ndarray - Indices of the features to be used in imputing ``feat_idx``. - - estimator : object - The estimator to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._estimator. - - fit_mode : boolean, default=True - Whether to fit and predict with the estimator or just predict. - - Returns - ------- - X_filled : ndarray - Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - - estimator : estimator with sklearn API - The fitted estimator used to impute - ``X_filled[missing_row_mask, feat_idx]``. 
- """ - - # if nothing is missing, just return the default - # (should not happen at fit time because feat_ids would be excluded) - missing_row_mask = mask_missing_values[:, feat_idx] - if not np.any(missing_row_mask): - return X_filled, estimator - - if estimator is None and fit_mode is False: - raise ValueError("If fit_mode is False, then an already-fitted " - "estimator should be passed in.") - - if estimator is None: - estimator = clone(self._estimator) - - if fit_mode: - X_train = safe_indexing(X_filled[:, neighbor_feat_idx], - ~missing_row_mask) - y_train = safe_indexing(X_filled[:, feat_idx], - ~missing_row_mask) - estimator.fit(X_train, y_train) - - # get posterior samples - X_test = safe_indexing(X_filled[:, neighbor_feat_idx], - missing_row_mask) - if self.sample_posterior: - mus, sigmas = estimator.predict(X_test, return_std=True) - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - # two types of problems: (1) non-positive sigmas, (2) mus outside - # legal range of min_value and max_value (results in inf sample) - positive_sigmas = sigmas > 0 - imputed_values[~positive_sigmas] = mus[~positive_sigmas] - mus_too_low = mus < self._min_value - imputed_values[mus_too_low] = self._min_value - mus_too_high = mus > self._max_value - imputed_values[mus_too_high] = self._max_value - # the rest can be sampled without statistical issues - inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high - mus = mus[inrange_mask] - sigmas = sigmas[inrange_mask] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[inrange_mask] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[inrange_mask] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = estimator.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) - - # update the feature - X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, estimator - - def _get_neighbor_feat_idx(self, - n_features, - feat_idx, - abs_corr_mat): - """Get a list of other features to predict ``feat_idx``. - - If self.n_nearest_features is less than or equal to the total - number of features, then use a probability proportional to the absolute - correlation between ``feat_idx`` and each other feature to randomly - choose a subsample of the other features (without replacement). - - Parameters - ---------- - n_features : int - Number of features in ``X``. - - feat_idx : int - Index of the feature currently being imputed. - - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X``. The diagonal has been zeroed - out and each feature has been normalized to sum to 1. Can be None. - - Returns - ------- - neighbor_feat_idx : array-like - The features to use to impute ``feat_idx``. 
- """ - if (self.n_nearest_features is not None and - self.n_nearest_features < n_features): - p = abs_corr_mat[:, feat_idx] - neighbor_feat_idx = self.random_state_.choice( - np.arange(n_features), self.n_nearest_features, replace=False, - p=p) - else: - inds_left = np.arange(feat_idx) - inds_right = np.arange(feat_idx + 1, n_features) - neighbor_feat_idx = np.concatenate((inds_left, inds_right)) - return neighbor_feat_idx - - def _get_ordered_idx(self, mask_missing_values): - """Decide in what order we will update the features. - - As a homage to the MICE R package, we will have 4 main options of - how to order the updates, and use a random order if anything else - is specified. - - Also, this function skips features which have no missing values. - - Parameters - ---------- - mask_missing_values : array-like, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - - Returns - ------- - ordered_idx : ndarray, shape (n_features,) - The order in which to impute the features. - """ - frac_of_missing_values = mask_missing_values.mean(axis=0) - missing_values_idx = np.nonzero(frac_of_missing_values)[0] - if self.imputation_order == 'roman': - ordered_idx = missing_values_idx - elif self.imputation_order == 'arabic': - ordered_idx = missing_values_idx[::-1] - elif self.imputation_order == 'ascending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:][::-1] - elif self.imputation_order == 'descending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:] - elif self.imputation_order == 'random': - ordered_idx = missing_values_idx - self.random_state_.shuffle(ordered_idx) - else: - raise ValueError("Got an invalid imputation order: '{0}'. It must " - "be one of the following: 'roman', 'arabic', " - "'ascending', 'descending', or " - "'random'.".format(self.imputation_order)) - return ordered_idx - - def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): - """Get absolute correlation matrix between features. - - Parameters - ---------- - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - tolerance : float, optional (default=1e-6) - ``abs_corr_mat`` can have nans, which will be replaced - with ``tolerance``. - - Returns - ------- - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X`` at the beginning of the - current round. The diagonal has been zeroed out and each feature's - absolute correlations with all others have been normalized to sum - to 1. - """ - n_features = X_filled.shape[1] - if (self.n_nearest_features is None or - self.n_nearest_features >= n_features): - return None - abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) - # np.corrcoef is not defined for features with zero std - abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance - # ensures exploration, i.e. at least some probability of sampling - np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) - # features are not their own neighbors - np.fill_diagonal(abs_corr_mat, 0) - # needs to sum to 1 for np.random.choice sampling - abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) - return abs_corr_mat - - def _initial_imputation(self, X): - """Perform initial imputation for input X. 
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - Returns - ------- - Xt : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - mask_missing_values : ndarray, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - """ - if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True - - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - - mask_missing_values = _get_mask(X, self.missing_values) - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) - - valid_mask = np.flatnonzero(np.logical_not( - np.isnan(self.initial_imputer_.statistics_))) - Xt = X[:, valid_mask] - mask_missing_values = mask_missing_values[:, valid_mask] - - return Xt, X_filled, mask_missing_values - - def fit_transform(self, X, y=None): - """Fits the imputer on X and return the transformed X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - self.random_state_ = getattr(self, "random_state_", - check_random_state(self.random_state)) - - if self.max_iter < 0: - raise ValueError( - "'max_iter' should be a positive integer. Got {} instead." - .format(self.max_iter)) - - if self.tol < 0: - raise ValueError( - "'tol' should be a non-negative float. Got {} instead." - .format(self.tol) - ) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - X_trans_indicator = self.indicator_.fit_transform(X) - else: - self.indicator_ = None - - if self.estimator is None: - from .linear_model import BayesianRidge - self._estimator = BayesianRidge() - else: - self._estimator = clone(self.estimator) - - self.imputation_sequence_ = [] - - if hasattr(self._estimator, 'random_state'): - self._estimator.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value - - self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.max_iter == 0 or np.all(mask_missing_values): - self.n_iter_ = 0 - return Xt - - # order in which to impute - # note this is probably too slow for large feature data (d > 100000) - # and a better way would be good. 
- # see: https://goo.gl/KyCNwj and subsequent comments - ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - - abs_corr_mat = self._get_abs_corr_mat(Xt) - - n_samples, n_features = Xt.shape - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - if not self.sample_posterior: - Xt_previous = Xt.copy() - normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) - for self.n_iter_ in range(1, self.max_iter + 1): - if self.imputation_order == 'random': - ordered_idx = self._get_ordered_idx(mask_missing_values) - - for feat_idx in ordered_idx: - neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, - feat_idx, - abs_corr_mat) - Xt, estimator = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - estimator=None, fit_mode=True) - estimator_triplet = ImputerTriplet(feat_idx, - neighbor_feat_idx, - estimator) - self.imputation_sequence_.append(estimator_triplet) - - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (self.n_iter_, self.max_iter, time() - start_t)) - - if not self.sample_posterior: - inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, - axis=None) - if inf_norm < normalized_tol: - if self.verbose > 0: - print('[IterativeImputer] Early stopping criterion ' - 'reached.') - break - Xt_previous = Xt.copy() - else: - if not self.sample_posterior: - warnings.warn("[IterativeImputer] Early stopping criterion not" - " reached.", ConvergenceWarning) - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def transform(self, X): - """Imputes all missing values in X. - - Note that this is stochastic, and that if random_state is not fixed, - repeated calls, or permuted input, will yield different results. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - check_is_fitted(self, 'initial_imputer_') - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.n_iter_ == 0 or np.all(mask_missing_values): - return Xt - - imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ - i_rnd = 0 - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - for it, estimator_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, - mask_missing_values, - estimator_triplet.feat_idx, - estimator_triplet.neighbor_feat_idx, - estimator=estimator_triplet.estimator, - fit_mode=False - ) - if not (it + 1) % imputations_per_round: - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter_, time() - start_t)) - i_rnd += 1 - - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def fit(self, X, y=None): - """Fits the imputer on X and return self. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored - - Returns - ------- - self : object - Returns self. 
- """ - self.fit_transform(X) - return self - - def _more_tags(self): - return {'allow_nan': True} - - -class MissingIndicator(BaseEstimator, TransformerMixin): - """Binary indicators for missing values. - - Note that this component typically should not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. - - features : str, optional - Whether the imputer mask should represent all or a subset of - features. - - - If "missing-only" (default), the imputer mask will only represent - features containing missing values during fit time. - - If "all", the imputer mask will represent all features. - - sparse : boolean or "auto", optional - Whether the imputer mask format should be sparse or dense. - - - If "auto" (default), the imputer mask will be of same type as - input. - - If True, the imputer mask will be a sparse matrix. - - If False, the imputer mask will be a numpy array. - - error_on_new : boolean, optional - If True (default), transform will raise an error when there are - features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. - - Attributes - ---------- - features_ : ndarray, shape (n_missing_features,) or (n_features,) - The features indices which will be returned when calling ``transform``. - They are computed during ``fit``. For ``features='all'``, it is - to ``range(n_features)``. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import MissingIndicator - >>> X1 = np.array([[np.nan, 1, 3], - ... [4, 0, np.nan], - ... [8, 1, 0]]) - >>> X2 = np.array([[5, 1, np.nan], - ... [np.nan, 2, 3], - ... [2, 4, 0]]) - >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE - MissingIndicator(error_on_new=True, features='missing-only', - missing_values=nan, sparse='auto') - >>> X2_tr = indicator.transform(X2) - >>> X2_tr - array([[False, True], - [ True, False], - [False, False]]) - - """ - - def __init__(self, missing_values=np.nan, features="missing-only", - sparse="auto", error_on_new=True): - self.missing_values = missing_values - self.features = features - self.sparse = sparse - self.error_on_new = error_on_new - - def _get_missing_features_info(self, X): - """Compute the imputer mask and the indices of the features - containing missing values. - - Parameters - ---------- - X : {ndarray or sparse matrix}, shape (n_samples, n_features) - The input data with missing values. Note that ``X`` has been - checked in ``fit`` and ``transform`` before to call this function. - - Returns - ------- - imputer_mask : {ndarray or sparse matrix}, shape \ -(n_samples, n_features) or (n_samples, n_features_with_missing) - The imputer mask of the original data. - - features_with_missing : ndarray, shape (n_features_with_missing) - The features containing missing values. - - """ - if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) - - # The imputer mask will be constructed with the same sparse format - # as X. 
- sparse_constructor = (sparse.csr_matrix if X.format == 'csr' - else sparse.csc_matrix) - imputer_mask = sparse_constructor( - (mask, X.indices.copy(), X.indptr.copy()), - shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() - - if self.features == 'missing-only': - n_missing = imputer_mask.getnnz(axis=0) - - if self.sparse is False: - imputer_mask = imputer_mask.toarray() - elif imputer_mask.format == 'csr': - imputer_mask = imputer_mask.tocsc() - else: - imputer_mask = _get_mask(X, self.missing_values) - - if self.features == 'missing-only': - n_missing = imputer_mask.sum(axis=0) - - if self.sparse is True: - imputer_mask = sparse.csc_matrix(imputer_mask) - - if self.features == 'all': - features_indices = np.arange(X.shape[1]) - else: - features_indices = np.flatnonzero(n_missing) - - return imputer_mask, features_indices - - def _validate_input(self, X): - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("MissingIndicator does not support data with " - "dtype {0}. Please provide either a numeric array" - " (with a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - if sparse.issparse(X) and self.missing_values == 0: - # missing_values = 0 not allowed with sparse data as it would - # force densification - raise ValueError("Sparse input with missing_values=0 is " - "not supported. Provide a dense " - "array instead.") - - return X - - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - X = self._validate_input(X) - self._n_features = X.shape[1] - - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) - - if not ((isinstance(self.sparse, str) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("'sparse' has to be a boolean or 'auto'. " - "Got {!r} instead.".format(self.sparse)) - - self.features_ = self._get_missing_features_info(X)[1] - - return self - - def transform(self, X): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. 
- - """ - check_is_fitted(self, "features_") - X = self._validate_input(X) - - if X.shape[1] != self._n_features: - raise ValueError("X has a different number of features " - "than during fitting.") - - imputer_mask, features = self._get_missing_features_info(X) - - if self.features == "missing-only": - features_diff_fit_trans = np.setdiff1d(features, self.features_) - if (self.error_on_new and features_diff_fit_trans.size > 0): - raise ValueError("The features {} have missing values " - "in transform but have no missing values " - "in fit.".format(features_diff_fit_trans)) - - if self.features_.size < self._n_features: - imputer_mask = imputer_mask[:, self.features_] - - return imputer_mask - - def fit_transform(self, X, y=None): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. - - """ - return self.fit(X, y).transform(X) - - def _more_tags(self): - return {'allow_nan': True, - 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py new file mode 100644 index 0000000000000..abeb4d471f5f3 --- /dev/null +++ b/sklearn/impute/__init__.py @@ -0,0 +1,8 @@ +"""Transformers for missing value imputation""" + +from ._base import MissingIndicator, SimpleImputer + +__all__ = [ + 'MissingIndicator', + 'SimpleImputer', +] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py new file mode 100644 index 0000000000000..7be9da691ce11 --- /dev/null +++ b/sklearn/impute/_base.py @@ -0,0 +1,675 @@ +# Authors: Nicolas Tresegnie +# Sergey Feldman +# License: BSD 3 clause + +from __future__ import division + +import warnings +import numbers + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils.sparsefuncs import _get_median +from ..utils.validation import check_is_fitted +from ..utils.validation import FLOAT_DTYPES +from ..utils.fixes import _object_dtype_isnan +from ..utils import is_scalar_nan +from ..utils import check_array + + +def _check_inputs_dtype(X, missing_values): + if (X.dtype.kind in ("f", "i", "u") and + not isinstance(missing_values, numbers.Real)): + raise ValueError("'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}." + .format(X.dtype, type(missing_values))) + + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if is_scalar_nan(value_to_mask): + if X.dtype.kind == "f": + return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) + else: + # np.isnan does not work on object dtypes. 
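+            # _object_dtype_isnan falls back to an elementwise comparison,
+            # exploiting the fact that NaN is the only value where x != x.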
+ return _object_dtype_isnan(X) + else: + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. + warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + + +class SimpleImputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. + + strategy : string, optional (default="mean") + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + verbose : integer, optional (default=0) + Controls the verbosity of the imputer. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If X is not an array of floating values; + - If X is encoded as a CSR matrix; + - If add_indicator=True. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. 
+ + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(add_indicator=False, copy=True, fill_value=None, + missing_values=nan, strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + Notes + ----- + Columns which only contained missing values at `fit` are discarded upon + `transform` if strategy is not "constant". + + """ + def __init__(self, missing_values=np.nan, strategy="mean", + fill_value=None, verbose=0, copy=True, add_indicator=False): + self.missing_values = missing_values + self.strategy = strategy + self.fill_value = fill_value + self.verbose = verbose + self.copy = copy + self.add_indicator = add_indicator + + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except ValueError as ve: + if "could not convert" in str(ve): + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + return X + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : SimpleImputer + """ + X = self._validate_input(X) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + # fill_value should be numerical in case of numerical input + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) + + if sparse.issparse(X): + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # scipy.stats.mstats.mode cannot be used because it will no work + # properly if the first element is masked and if its frequency + # is equal to the frequency of the most frequent valid element + # See https://github.com/scipy/scipy/issues/2636 + + # To be able access the elements by columns + X = X.transpose() + mask = mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(np.bool) + row = row[row_mask] + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + def transform(self, X): + """Impute all missing values in X. 
+
+    def transform(self, X):
+        """Impute all missing values in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        X : {ndarray, sparse matrix}
+            The input data with missing values imputed.
+        """
+        check_is_fitted(self, 'statistics_')
+
+        X = self._validate_input(X)
+
+        statistics = self.statistics_
+
+        if X.shape[1] != statistics.shape[0]:
+            raise ValueError("X has %d features per sample, expected %d"
+                             % (X.shape[1], self.statistics_.shape[0]))
+
+        if self.add_indicator:
+            X_trans_indicator = self.indicator_.transform(X)
+
+        # Delete the invalid columns if strategy is not constant
+        if self.strategy == "constant":
+            valid_statistics = statistics
+        else:
+            # same as np.isnan but also works for object dtypes
+            invalid_mask = _get_mask(statistics, np.nan)
+            valid_mask = np.logical_not(invalid_mask)
+            valid_statistics = statistics[valid_mask]
+            valid_statistics_indexes = np.flatnonzero(valid_mask)
+
+            if invalid_mask.any():
+                missing = np.arange(X.shape[1])[invalid_mask]
+                if self.verbose:
+                    warnings.warn("Deleting features without "
+                                  "observed values: %s" % missing)
+                X = X[:, valid_statistics_indexes]
+
+        # Do actual imputation
+        if sparse.issparse(X):
+            if self.missing_values == 0:
+                raise ValueError("Imputation not possible when missing_values "
+                                 "== 0 and input is sparse. Provide a dense "
+                                 "array instead.")
+            else:
+                mask = _get_mask(X.data, self.missing_values)
+                indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
+                                    np.diff(X.indptr))[mask]
+
+                X.data[mask] = valid_statistics[indexes].astype(X.dtype,
+                                                                copy=False)
+        else:
+            mask = _get_mask(X, self.missing_values)
+            n_missing = np.sum(mask, axis=0)
+            values = np.repeat(valid_statistics, n_missing)
+            coordinates = np.where(mask.transpose())[::-1]
+
+            X[coordinates] = values
+
+        if self.add_indicator:
+            hstack = sparse.hstack if sparse.issparse(X) else np.hstack
+            X = hstack((X, X_trans_indicator))
+
+        return X
+
+    def _more_tags(self):
+        return {'allow_nan': True}
+
+
+class MissingIndicator(BaseEstimator, TransformerMixin):
+    """Binary indicators for missing values.
+
+    Note that this component typically should not be used in a vanilla
+    :class:`Pipeline` consisting of transformers and a classifier, but rather
+    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
+
+    Read more in the :ref:`User Guide <impute>`.
+
+    Parameters
+    ----------
+    missing_values : number, string, np.nan (default) or None
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be indicated (True in the output array), the
+        other values will be marked as False.
+
+    features : str, optional
+        Whether the imputer mask should represent all or a subset of
+        features.
+
+        - If "missing-only" (default), the imputer mask will only represent
+          features containing missing values during fit time.
+        - If "all", the imputer mask will represent all features.
+
+    sparse : boolean or "auto", optional
+        Whether the imputer mask format should be sparse or dense.
+
+        - If "auto" (default), the imputer mask will be of same type as
+          input.
+        - If True, the imputer mask will be a sparse matrix.
+        - If False, the imputer mask will be a numpy array.
+
+    error_on_new : boolean, optional
+        If True (default), transform will raise an error when there are
+        features with missing values in transform that have no missing values
+        in fit. This is applicable only when ``features="missing-only"``.
+
+    Attributes
+    ----------
+    features_ : ndarray, shape (n_missing_features,) or (n_features,)
+        The indices of the features which will be returned when calling
+        ``transform``. They are computed during ``fit``. For
+        ``features='all'``, it is equal to ``range(n_features)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.impute import MissingIndicator
+    >>> X1 = np.array([[np.nan, 1, 3],
+    ...                [4, 0, np.nan],
+    ...                [8, 1, 0]])
+    >>> X2 = np.array([[5, 1, np.nan],
+    ...                [np.nan, 2, 3],
+    ...                [2, 4, 0]])
+    >>> indicator = MissingIndicator()
+    >>> indicator.fit(X1)  # doctest: +NORMALIZE_WHITESPACE
+    MissingIndicator(error_on_new=True, features='missing-only',
+                     missing_values=nan, sparse='auto')
+    >>> X2_tr = indicator.transform(X2)
+    >>> X2_tr
+    array([[False,  True],
+           [ True, False],
+           [False, False]])
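+
+    The mask can also cover every feature (a minimal illustrative example):
+
+    >>> indicator_all = MissingIndicator(features="all")
+    >>> indicator_all.fit_transform(X2)
+    array([[False, False,  True],
+           [ True, False, False],
+           [False, False, False]])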
+
+    """
+
+    def __init__(self, missing_values=np.nan, features="missing-only",
+                 sparse="auto", error_on_new=True):
+        self.missing_values = missing_values
+        self.features = features
+        self.sparse = sparse
+        self.error_on_new = error_on_new
+
+    def _get_missing_features_info(self, X):
+        """Compute the imputer mask and the indices of the features
+        containing missing values.
+
+        Parameters
+        ----------
+        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
+            The input data with missing values. Note that ``X`` has been
+            checked in ``fit`` and ``transform`` before calling this function.
+
+        Returns
+        -------
+        imputer_mask : {ndarray or sparse matrix}, shape \
+(n_samples, n_features) or (n_samples, n_features_with_missing)
+            The imputer mask of the original data.
+
+        features_with_missing : ndarray, shape (n_features_with_missing,)
+            The features containing missing values.
+
+        """
+        if sparse.issparse(X):
+            mask = _get_mask(X.data, self.missing_values)
+
+            # The imputer mask will be constructed with the same sparse format
+            # as X.
+            sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
+                                  else sparse.csc_matrix)
+            imputer_mask = sparse_constructor(
+                (mask, X.indices.copy(), X.indptr.copy()),
+                shape=X.shape, dtype=bool)
+            imputer_mask.eliminate_zeros()
+
+            if self.features == 'missing-only':
+                n_missing = imputer_mask.getnnz(axis=0)
+
+            if self.sparse is False:
+                imputer_mask = imputer_mask.toarray()
+            elif imputer_mask.format == 'csr':
+                imputer_mask = imputer_mask.tocsc()
+        else:
+            imputer_mask = _get_mask(X, self.missing_values)
+
+            if self.features == 'missing-only':
+                n_missing = imputer_mask.sum(axis=0)
+
+            if self.sparse is True:
+                imputer_mask = sparse.csc_matrix(imputer_mask)
+
+        if self.features == 'all':
+            features_indices = np.arange(X.shape[1])
+        else:
+            features_indices = np.flatnonzero(n_missing)
+
+        return imputer_mask, features_indices
+
+    def _validate_input(self, X):
+        if not is_scalar_nan(self.missing_values):
+            force_all_finite = True
+        else:
+            force_all_finite = "allow-nan"
+        X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
+                        force_all_finite=force_all_finite)
+        _check_inputs_dtype(X, self.missing_values)
+        if X.dtype.kind not in ("i", "u", "f", "O"):
+            raise ValueError("MissingIndicator does not support data with "
+                             "dtype {0}. Please provide either a numeric array"
+                             " (with a floating point or integer dtype) or "
+                             "categorical data represented either as an array "
+                             "with integer dtype or an array of string values "
+                             "with an object dtype.".format(X.dtype))
+
+        if sparse.issparse(X) and self.missing_values == 0:
+            # missing_values = 0 not allowed with sparse data as it would
+            # force densification
+            raise ValueError("Sparse input with missing_values=0 is "
+                             "not supported. Provide a dense "
+                             "array instead.")
+
+        return X
+
+    def fit(self, X, y=None):
+        """Fit the transformer on X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Input data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X = self._validate_input(X)
+        self._n_features = X.shape[1]
+
+        if self.features not in ('missing-only', 'all'):
+            raise ValueError("'features' has to be either 'missing-only' or "
+                             "'all'. Got {} instead.".format(self.features))
+
+        if not ((isinstance(self.sparse, str) and
+                self.sparse == "auto") or isinstance(self.sparse, bool)):
+            raise ValueError("'sparse' has to be a boolean or 'auto'. "
+                             "Got {!r} instead.".format(self.sparse))
+
+        self.features_ = self._get_missing_features_info(X)[1]
+
+        return self
+
+    def transform(self, X):
+        """Generate missing values indicator for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
+or (n_samples, n_features_with_missing)
+            The missing indicator for input data. The data type of ``Xt``
+            will be boolean.
+
+        """
+        check_is_fitted(self, "features_")
+        X = self._validate_input(X)
+
+        if X.shape[1] != self._n_features:
+            raise ValueError("X has a different number of features "
+                             "than during fitting.")
+
+        imputer_mask, features = self._get_missing_features_info(X)
+
+        if self.features == "missing-only":
+            features_diff_fit_trans = np.setdiff1d(features, self.features_)
+            if (self.error_on_new and features_diff_fit_trans.size > 0):
+                raise ValueError("The features {} have missing values "
+                                 "in transform but have no missing values "
+                                 "in fit.".format(features_diff_fit_trans))
+
+            if self.features_.size < self._n_features:
+                imputer_mask = imputer_mask[:, self.features_]
+
+        return imputer_mask
+
+    def fit_transform(self, X, y=None):
+        """Generate missing values indicator for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
+or (n_samples, n_features_with_missing)
+            The missing indicator for input data. The data type of ``Xt``
+            will be boolean.
+
+        """
+        return self.fit(X, y).transform(X)
+
+    def _more_tags(self):
+        return {'allow_nan': True,
+                'X_types': ['2darray', 'str']}
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
new file mode 100644
index 0000000000000..40df3f4059c04
--- /dev/null
+++ b/sklearn/impute/_iterative.py
@@ -0,0 +1,680 @@
+
+from time import time
+from distutils.version import LooseVersion
+from collections import namedtuple
+import warnings
+
+import scipy
+from scipy import stats
+import numpy as np
+
+from ..base import clone, BaseEstimator, TransformerMixin
+from ..exceptions import ConvergenceWarning
+from ..preprocessing import normalize
+from ..utils import check_array, check_random_state, safe_indexing
+from ..utils.validation import FLOAT_DTYPES, check_is_fitted
+from ..utils import is_scalar_nan
+
+from ._base import (_get_mask, MissingIndicator, SimpleImputer,
+                    _check_inputs_dtype)
+
+
+_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx',
+                                                 'neighbor_feat_idx',
+                                                 'estimator'])
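+
+
+# Note (illustrative): each fitted round-robin step is recorded as e.g.
+# _ImputerTriplet(feat_idx=3, neighbor_feat_idx=array([0, 1, 2, 4]),
+#                 estimator=BayesianRidge()), so that transform() can replay
+# the fitted estimators in order without refitting.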
+
+
+class IterativeImputer(BaseEstimator, TransformerMixin):
+    """Multivariate imputer that estimates each feature from all the others.
+
+    A strategy for imputing missing values by modeling each feature with
+    missing values as a function of other features in a round-robin fashion.
+
+    Read more in the :ref:`User Guide <iterative_imputer>`.
+
+    .. note::
+
+       This estimator is still **experimental** for now: the predictions
+       and the API might change without any deprecation cycle. To use it,
+       you need to explicitly import ``enable_iterative_imputer``::
+
+           >>> # explicitly require this experimental feature
+           >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+           >>> # now you can import normally from sklearn.impute
+           >>> from sklearn.impute import IterativeImputer
+
+    Parameters
+    ----------
+    estimator : estimator object, default=BayesianRidge()
+        The estimator to use at each step of the round-robin imputation.
+        If ``sample_posterior`` is True, the estimator must support
+        ``return_std`` in its ``predict`` method.
+
+    missing_values : int, np.nan, optional (default=np.nan)
+        The placeholder for the missing values. All occurrences of
+        ``missing_values`` will be imputed.
+
+    sample_posterior : boolean, default=False
+        Whether to sample from the (Gaussian) predictive posterior of the
+        fitted estimator for each imputation. Estimator must support
+        ``return_std`` in its ``predict`` method if set to ``True``. Set to
+        ``True`` if using ``IterativeImputer`` for multiple imputations.
+
+    max_iter : int, optional (default=10)
+        Maximum number of imputation rounds to perform before returning the
+        imputations computed during the final round. A round is a single
+        imputation of each feature with missing values. The stopping criterion
+        is met once ``abs(max(X_t - X_{t-1}))/abs(max(X[known_vals])) < tol``,
+        where ``X_t`` is ``X`` at iteration ``t``. Note that early stopping is
+        only applied if ``sample_posterior=False``.
+
+    tol : float, optional (default=1e-3)
+        Tolerance of the stopping condition.
+
+    n_nearest_features : int, optional (default=None)
+        Number of other features to use to estimate the missing values of
+        each feature column. Nearness between features is measured using
+        the absolute correlation coefficient between each feature pair (after
+        initial imputation). To ensure coverage of features throughout the
+        imputation process, the neighbor features are not necessarily nearest,
+        but are drawn with probability proportional to correlation for each
+        imputed target feature. Can provide significant speed-up when the
+        number of features is huge. If ``None``, all features will be used.
+
+    initial_strategy : str, optional (default="mean")
+        Which strategy to use to initialize the missing values. Same as the
+        ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer`.
+        Valid values: {"mean", "median", "most_frequent", or "constant"}.
+
+    imputation_order : str, optional (default="ascending")
+        The order in which the features will be imputed. Possible values:
+
+        "ascending"
+            From features with fewest missing values to most.
+        "descending"
+            From features with most missing values to fewest.
+        "roman"
+            Left to right.
+        "arabic"
+            Right to left.
+        "random"
+            A random order for each round.
+
+    min_value : float, optional (default=None)
+        Minimum possible imputed value. Default of ``None`` will set minimum
+        to negative infinity.
+
+    max_value : float, optional (default=None)
+        Maximum possible imputed value. Default of ``None`` will set maximum
+        to positive infinity.
+
+    verbose : int, optional (default=0)
+        Verbosity flag, controls the debug messages that are issued
+        as functions are evaluated. The higher, the more verbose. Can be 0, 1,
+        or 2.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator to use. Randomizes
+        selection of estimator features if n_nearest_features is not None, the
+        ``imputation_order`` if ``random``, and the sampling from posterior if
+        ``sample_posterior`` is True. Use an integer for determinism.
+        See :term:`the Glossary <random_state>`.
+
+    add_indicator : boolean, optional (default=False)
+        If True, a `MissingIndicator` transform will stack onto output
+        of the imputer's transform. This allows a predictive estimator
+        to account for missingness despite imputation. If a feature has no
+        missing values at fit/train time, the feature won't appear on
+        the missing indicator even if there are missing values at
+        transform/test time.
+
+    Attributes
+    ----------
+    initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
+        Imputer used to initialize the missing values.
+
+    imputation_sequence_ : list of tuples
+        Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
+        ``feat_idx`` is the current feature to be imputed,
+        ``neighbor_feat_idx`` is the array of other features used to impute the
+        current feature, and ``estimator`` is the trained estimator used for
+        the imputation. Length is ``self.n_features_with_missing_ *
+        self.n_iter_``.
+
+    n_iter_ : int
+        Number of iteration rounds that occurred. Will be less than
+        ``self.max_iter`` if early stopping criterion was reached.
+
+    n_features_with_missing_ : int
+        Number of features with missing values.
+
+    indicator_ : :class:`sklearn.impute.MissingIndicator`
+        Indicator used to add binary indicators for missing values.
+        ``None`` if add_indicator is False.
+
+    See also
+    --------
+    SimpleImputer : Univariate imputation of missing values.
+
+    Notes
+    -----
+    To support imputation in inductive mode we store each feature's estimator
+    during the ``fit`` phase, and predict without refitting (in order) during
+    the ``transform`` phase.
+
+    Features which contain all missing values at ``fit`` are discarded upon
+    ``transform``.
+
+    Features with missing values during ``transform`` which did not have any
+    missing values during ``fit`` will be imputed with the initial imputation
+    method only.
+
+    References
+    ----------
+    .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
+        Multivariate Imputation by Chained Equations in R". Journal of
+        Statistical Software 45: 1-67.
+        `_
+
+    .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
+        Multivariate Data Suitable for use with an Electronic Computer".
+        Journal of the Royal Statistical Society 22(2): 302-306.
+        `_
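+
+    Examples
+    --------
+    A minimal usage sketch; the imputed values depend on the estimator and
+    the data, so only the (deterministic) output shape is shown:
+
+    >>> import numpy as np
+    >>> # explicitly require this experimental feature
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> from sklearn.impute import IterativeImputer
+    >>> X = [[1, 2], [3, np.nan], [np.nan, 6], [8, 8]]
+    >>> imp = IterativeImputer(max_iter=10, random_state=0)
+    >>> imp.fit_transform(X).shape
+    (4, 2)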
+    """
+
+    def __init__(self,
+                 estimator=None,
+                 missing_values=np.nan,
+                 sample_posterior=False,
+                 max_iter=10,
+                 tol=1e-3,
+                 n_nearest_features=None,
+                 initial_strategy="mean",
+                 imputation_order='ascending',
+                 min_value=None,
+                 max_value=None,
+                 verbose=0,
+                 random_state=None,
+                 add_indicator=False):
+
+        self.estimator = estimator
+        self.missing_values = missing_values
+        self.sample_posterior = sample_posterior
+        self.max_iter = max_iter
+        self.tol = tol
+        self.n_nearest_features = n_nearest_features
+        self.initial_strategy = initial_strategy
+        self.imputation_order = imputation_order
+        self.min_value = min_value
+        self.max_value = max_value
+        self.verbose = verbose
+        self.random_state = random_state
+        self.add_indicator = add_indicator
+
+    def _impute_one_feature(self,
+                            X_filled,
+                            mask_missing_values,
+                            feat_idx,
+                            neighbor_feat_idx,
+                            estimator=None,
+                            fit_mode=True):
+        """Impute a single feature from the others provided.
+
+        This function predicts the missing values of one of the features using
+        the current estimates of all the other features. The ``estimator``
+        must support ``return_std=True`` in its ``predict`` method if
+        ``sample_posterior`` is True.
+
+        Parameters
+        ----------
+        X_filled : ndarray
+            Input data with the most recent imputations.
+
+        mask_missing_values : ndarray
+            Input data's missing indicator matrix.
+
+        feat_idx : int
+            Index of the feature currently being imputed.
+
+        neighbor_feat_idx : ndarray
+            Indices of the features to be used in imputing ``feat_idx``.
+
+        estimator : object
+            The estimator to use at this step of the round-robin imputation.
+            If ``sample_posterior`` is True, the estimator must support
+            ``return_std`` in its ``predict`` method.
+            If None, it will be cloned from self._estimator.
+
+        fit_mode : boolean, default=True
+            Whether to fit and predict with the estimator or just predict.
+
+        Returns
+        -------
+        X_filled : ndarray
+            Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.
+
+        estimator : estimator with sklearn API
+            The fitted estimator used to impute
+            ``X_filled[missing_row_mask, feat_idx]``.
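+
+        Notes
+        -----
+        Illustrative example: with ``sample_posterior=True``, a missing entry
+        whose predictive mean and standard deviation are ``mu=2.0`` and
+        ``sigma=0.5``, under ``min_value=0`` and ``max_value=None``, is drawn
+        from ``stats.truncnorm(a=(0 - 2.0) / 0.5, b=np.inf, loc=2.0,
+        scale=0.5)``.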
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
+
+    def _initial_imputation(self, X):
+        """Perform initial imputation for input X.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        Returns
+        -------
+        Xt : ndarray, shape (n_samples, n_features)
+            Input data restricted to the features that have at least one
+            observed value (features that were entirely missing are dropped).
+
+        X_filled : ndarray, shape (n_samples, n_features)
+            Input data with the most recent imputations.
+
+        mask_missing_values : ndarray, shape (n_samples, n_features)
+            Input data's missing indicator matrix, where "n_samples" is the
+            number of samples and "n_features" is the number of features.
+        """
+        if is_scalar_nan(self.missing_values):
+            force_all_finite = "allow-nan"
+        else:
+            force_all_finite = True
+
+        X = check_array(X, dtype=FLOAT_DTYPES, order="F",
+                        force_all_finite=force_all_finite)
+        _check_inputs_dtype(X, self.missing_values)
+
+        mask_missing_values = _get_mask(X, self.missing_values)
+        if self.initial_imputer_ is None:
+            self.initial_imputer_ = SimpleImputer(
+                missing_values=self.missing_values,
+                strategy=self.initial_strategy)
+            X_filled = self.initial_imputer_.fit_transform(X)
+        else:
+            X_filled = self.initial_imputer_.transform(X)
+
+        valid_mask = np.flatnonzero(np.logical_not(
+            np.isnan(self.initial_imputer_.statistics_)))
+        Xt = X[:, valid_mask]
+        mask_missing_values = mask_missing_values[:, valid_mask]
+
+        return Xt, X_filled, mask_missing_values
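+
+    # Note (illustrative) on the stopping criterion used in fit_transform
+    # below: with tol=1e-3 and np.max(np.abs(X[~mask_missing_values])) == 50,
+    # the round-robin loop stops once the largest absolute change between
+    # two successive imputations falls below 1e-3 * 50 == 0.05.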
+
+    def fit_transform(self, X, y=None):
+        """Fits the imputer on X and returns the transformed X.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        y : ignored.
+
+        Returns
+        -------
+        Xt : array-like, shape (n_samples, n_features)
+            The imputed input data.
+        """
+        self.random_state_ = getattr(self, "random_state_",
+                                     check_random_state(self.random_state))
+
+        if self.max_iter < 0:
+            raise ValueError(
+                "'max_iter' should be a non-negative integer. Got {} instead."
+                .format(self.max_iter))
+
+        if self.tol < 0:
+            raise ValueError(
+                "'tol' should be a non-negative float. Got {} instead."
+                .format(self.tol)
+            )
+
+        if self.add_indicator:
+            self.indicator_ = MissingIndicator(
+                missing_values=self.missing_values)
+            X_trans_indicator = self.indicator_.fit_transform(X)
+        else:
+            self.indicator_ = None
+
+        if self.estimator is None:
+            from ..linear_model import BayesianRidge
+            self._estimator = BayesianRidge()
+        else:
+            self._estimator = clone(self.estimator)
+
+        self.imputation_sequence_ = []
+
+        if hasattr(self._estimator, 'random_state'):
+            self._estimator.random_state = self.random_state_
+
+        self._min_value = -np.inf if self.min_value is None else self.min_value
+        self._max_value = np.inf if self.max_value is None else self.max_value
+
+        self.initial_imputer_ = None
+        X, Xt, mask_missing_values = self._initial_imputation(X)
+
+        if self.max_iter == 0 or np.all(mask_missing_values):
+            self.n_iter_ = 0
+            return Xt
+
+        # order in which to impute
+        # note this is probably too slow for large feature data (d > 100000)
+        # and a better way would be good.
+        # see: https://goo.gl/KyCNwj and subsequent comments
+        ordered_idx = self._get_ordered_idx(mask_missing_values)
+        self.n_features_with_missing_ = len(ordered_idx)
+
+        abs_corr_mat = self._get_abs_corr_mat(Xt)
+
+        n_samples, n_features = Xt.shape
+        if self.verbose > 0:
+            print("[IterativeImputer] Completing matrix with shape %s"
+                  % (X.shape,))
+        start_t = time()
+        if not self.sample_posterior:
+            Xt_previous = Xt.copy()
+            normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))
+        for self.n_iter_ in range(1, self.max_iter + 1):
+            if self.imputation_order == 'random':
+                ordered_idx = self._get_ordered_idx(mask_missing_values)
+
+            for feat_idx in ordered_idx:
+                neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
+                                                                feat_idx,
+                                                                abs_corr_mat)
+                Xt, estimator = self._impute_one_feature(
+                    Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
+                    estimator=None, fit_mode=True)
+                estimator_triplet = _ImputerTriplet(feat_idx,
+                                                    neighbor_feat_idx,
+                                                    estimator)
+                self.imputation_sequence_.append(estimator_triplet)
+
+            if self.verbose > 1:
+                print('[IterativeImputer] Ending imputation round '
+                      '%d/%d, elapsed time %0.2f'
+                      % (self.n_iter_, self.max_iter, time() - start_t))
+
+            if not self.sample_posterior:
+                inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf,
+                                          axis=None)
+                if inf_norm < normalized_tol:
+                    if self.verbose > 0:
+                        print('[IterativeImputer] Early stopping criterion '
+                              'reached.')
+                    break
+                Xt_previous = Xt.copy()
+        else:
+            if not self.sample_posterior:
+                warnings.warn("[IterativeImputer] Early stopping criterion not"
+                              " reached.", ConvergenceWarning)
+        Xt[~mask_missing_values] = X[~mask_missing_values]
+
+        if self.add_indicator:
+            Xt = np.hstack((Xt, X_trans_indicator))
+        return Xt
+
+    def transform(self, X):
+        """Imputes all missing values in X.
+
+        Note that this is stochastic: if random_state is not fixed,
+        repeated calls or permuted input will yield different results.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : array-like, shape (n_samples, n_features)
+            The imputed input data.
+        """
+        check_is_fitted(self, 'initial_imputer_')
+
+        if self.add_indicator:
+            X_trans_indicator = self.indicator_.transform(X)
+
+        X, Xt, mask_missing_values = self._initial_imputation(X)
+
+        if self.n_iter_ == 0 or np.all(mask_missing_values):
+            return Xt
+
+        imputations_per_round = len(self.imputation_sequence_) // self.n_iter_
+        i_rnd = 0
+        if self.verbose > 0:
+            print("[IterativeImputer] Completing matrix with shape %s"
+                  % (X.shape,))
+        start_t = time()
+        for it, estimator_triplet in enumerate(self.imputation_sequence_):
+            Xt, _ = self._impute_one_feature(
+                Xt,
+                mask_missing_values,
+                estimator_triplet.feat_idx,
+                estimator_triplet.neighbor_feat_idx,
+                estimator=estimator_triplet.estimator,
+                fit_mode=False
+            )
+            if not (it + 1) % imputations_per_round:
+                if self.verbose > 1:
+                    print('[IterativeImputer] Ending imputation round '
+                          '%d/%d, elapsed time %0.2f'
+                          % (i_rnd + 1, self.n_iter_, time() - start_t))
+                i_rnd += 1
+
+        Xt[~mask_missing_values] = X[~mask_missing_values]
+
+        if self.add_indicator:
+            Xt = np.hstack((Xt, X_trans_indicator))
+        return Xt
+
+    def fit(self, X, y=None):
+        """Fits the imputer on X and returns self.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        y : ignored
+
+        Returns
+        -------
+        self : object
+            Returns self.
+ """ + self.fit_transform(X) + return self + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/impute/tests/__init__.py b/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/tests/test_impute.py b/sklearn/impute/tests/test_impute.py similarity index 99% rename from sklearn/tests/test_impute.py rename to sklearn/impute/tests/test_impute.py index 979140ba246cf..1552031ff2193 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -13,6 +13,9 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa + from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor diff --git a/sklearn/setup.py b/sklearn/setup.py index e6f10cad77d9f..5a377043e9e38 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('impute') + config.add_subpackage('impute/tests') config.add_subpackage('inspection') config.add_subpackage('inspection/tests') config.add_subpackage('mixture')