diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f7cffa1e663b5..56056291b4dc5 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -33,7 +33,7 @@
 ]
 
 
-def _encode_numpy(values, uniques=None, encode=False):
+def _encode_numpy(values, uniques=None, encode=False, impute_value=None):
     # only used in _encode below, see docstring there for details
     if uniques is None:
         if encode:
@@ -43,8 +43,12 @@ def _encode_numpy(values, uniques=None, encode=False):
             # unique sorts
             return np.unique(values)
     if encode:
-        diff = _encode_check_unknown(values, uniques)
-        if diff:
+        diff, mask = _encode_check_unknown(values, uniques, return_mask=True)
+        if impute_value is not None:
+            # np.where allocates a new array: the caller's input is left
+            # untouched and a fixed-width dtype cannot truncate the label.
+            values = np.where(mask, values, impute_value)
+        elif diff:
             raise ValueError("y contains previously unseen labels: %s"
                              % str(diff))
         encoded = np.searchsorted(uniques, values)
@@ -53,24 +57,28 @@ def _encode_numpy(values, uniques=None, encode=False):
         return uniques
 
 
-def _encode_python(values, uniques=None, encode=False):
+def _encode_python(values, uniques=None, encode=False, impute_value=None):
     # only used in _encode below, see docstring there for details
     if uniques is None:
         uniques = sorted(set(values))
         uniques = np.array(uniques, dtype=values.dtype)
     if encode:
         table = {val: i for i, val in enumerate(uniques)}
-        try:
-            encoded = np.array([table[v] for v in values])
-        except KeyError as e:
-            raise ValueError("y contains previously unseen labels: %s"
-                             % str(e))
+        if impute_value is not None:
+            fallback = table[impute_value]
+            encoded = np.array([table.get(v, fallback) for v in values])
+        else:
+            try:
+                encoded = np.array([table[v] for v in values])
+            except KeyError as e:
+                raise ValueError("y contains previously unseen labels: %s"
+                                 % str(e))
         return uniques, encoded
     else:
         return uniques
 
 
-def _encode(values, uniques=None, encode=False):
+def _encode(values, uniques=None, encode=False, impute_value=None):
     """Helper function to factorize (find uniques) and encode values.
 
     Uses pure python method for object dtype, and numpy method for
@@ -90,6 +98,10 @@ def _encode(values, uniques=None, encode=False):
         already have been determined in fit).
     encode : bool, default False
         If True, also encode the values into integer codes based on `uniques`.
+    impute_value : str, int or float, optional
+        If not None, previously unseen values are encoded as this value
+        instead of raising an error. It must be one of `uniques`; falsy
+        labels such as 0 or '' are valid.
 
     Returns
     -------
@@ -102,12 +114,12 @@ def _encode(values, uniques=None, encode=False):
     """
     if values.dtype == object:
         try:
-            res = _encode_python(values, uniques, encode)
+            res = _encode_python(values, uniques, encode, impute_value)
         except TypeError:
             raise TypeError("argument must be a string or number")
         return res
     else:
-        return _encode_numpy(values, uniques, encode)
+        return _encode_numpy(values, uniques, encode, impute_value)
 
 
 def _encode_check_unknown(values, uniques, return_mask=False):
@@ -165,6 +177,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     Read more in the :ref:`User Guide <preprocessing_targets>`.
 
+    Parameters
+    ----------
+    impute_method : {'most_common'} or None, default=None
+        How to encode labels unseen during fit. With 'most_common' they are
+        encoded as the most frequent label of the training data. By default
+        (None) a ValueError is raised if any unseen label is found.
+
     Attributes
     ----------
     classes_ : array of shape (n_class,)
@@ -177,7 +196,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(impute_method=None)
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -190,7 +209,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(impute_method=None)
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
@@ -204,6 +223,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     using a one-hot or ordinal encoding scheme.
     """
 
+    def __init__(self, impute_method=None):
+        self.impute_method = impute_method
+        super().__init__()
+
     def fit(self, y):
         """Fit label encoder
 
@@ -217,7 +240,9 @@ def fit(self, y):
         self : returns an instance of self.
         """
         y = column_or_1d(y, warn=True)
-        self.classes_ = _encode(y)
+        self.classes_, y = _encode(y, encode=True)
+        if self.impute_method:
+            self.impute_value = self._calculate_impute_value(y)
         return self
 
     def fit_transform(self, y):
@@ -234,6 +259,8 @@ def fit_transform(self, y):
         """
         y = column_or_1d(y, warn=True)
         self.classes_, y = _encode(y, encode=True)
+        if self.impute_method:
+            self.impute_value = self._calculate_impute_value(y)
         return y
 
     def transform(self, y):
@@ -253,8 +280,9 @@ def transform(self, y):
         # transform of empty array is empty array
         if _num_samples(y) == 0:
             return np.array([])
-
-        _, y = _encode(y, uniques=self.classes_, encode=True)
+        impute_value = self.impute_value if self.impute_method else None
+        _, y = _encode(y, uniques=self.classes_,
+                       encode=True, impute_value=impute_value)
         return y
 
     def inverse_transform(self, y):
@@ -278,13 +306,33 @@ def inverse_transform(self, y):
         diff = np.setdiff1d(y, np.arange(len(self.classes_)))
         if len(diff):
             raise ValueError(
-                    "y contains previously unseen labels: %s" % str(diff))
+                "y contains previously unseen labels: %s" % str(diff)
+            )
         y = np.asarray(y)
         return self.classes_[y]
 
     def _more_tags(self):
         return {'X_types': ['1dlabels']}
 
+    def _calculate_impute_value(self, y):
+        """Return the label that replaces unseen labels in transform.
+
+        Parameters
+        ----------
+        y : numpy array of shape [n_samples]
+            Integer-encoded target values.
+
+        Returns
+        -------
+        impute_value : str, int or float
+        """
+        if self.impute_method == 'most_common':
+            values, counts = np.unique(y, return_counts=True)
+            impute_value = self.classes_[values[np.argmax(counts)]]
+        else:
+            raise ValueError('impute_method can only be "most_common" or None')
+        return impute_value
+
 
 class LabelBinarizer(BaseEstimator, TransformerMixin):
     """Binarize labels in a one-vs-all fashion
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 8a7db601686a8..79b91022f8ada 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -196,6 +196,54 @@ def test_label_encoder(values, classes, unknown):
     with pytest.raises(ValueError, match="unseen labels"):
         le.transform(unknown)
 
+    with pytest.raises(ValueError, match="impute_method"):
+        LabelEncoder(impute_method='test').fit(values)
+
+    with pytest.raises(ValueError, match="impute_method"):
+        LabelEncoder(impute_method='test').fit_transform(values)
+
+
+@pytest.mark.parametrize(
+    "values, classes, unknown",
+    [(np.array([2, 1, 3, 1, 3, 3], dtype='int64'),
+      np.array([1, 2, 3], dtype='int64'),
+      np.array([4], dtype='int64')),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'c'], dtype=object),
+      np.array(['a', 'b', 'c'], dtype=object),
+      np.array(['d'], dtype=object)),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'c']),
+      np.array(['a', 'b', 'c']),
+      np.array(['d']))],
+    ids=['int64', 'object', 'str'])
+def test_label_encoder_impute_most_common(values, classes, unknown):
+    # Test LabelEncoder's transform, fit_transform and
+    # inverse_transform methods
+    le = LabelEncoder(impute_method='most_common')
+    le.fit(values)
+    assert_array_equal(le.classes_, classes)
+    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2, 2])
+    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2, 2]), values)
+    assert_array_equal(le.transform(unknown), [2])
+    ret = le.fit_transform(values)
+    assert_array_equal(ret, [1, 0, 2, 0, 2, 2])
+    assert_array_equal(le.transform(unknown), [2])
+
+
+@pytest.mark.parametrize(
+    "values, unknown, expected",
+    [(np.array([0, 0, 1], dtype='int64'), np.array([5], dtype='int64'), 0),
+     (np.array(['', '', 'a'], dtype=object),
+      np.array(['b'], dtype=object), 0),
+     (np.array(['ab', 'ab', 'aa']), np.array(['z']), 1)],
+    ids=['falsy-int64', 'falsy-object', 'short-str-dtype'])
+def test_label_encoder_impute_edge_cases(values, unknown, expected):
+    # The imputed label may be falsy (0 or '') or wider than the dtype of
+    # the array passed to transform; the input must be left untouched.
+    le = LabelEncoder(impute_method='most_common').fit(values)
+    unknown_copy = unknown.copy()
+    assert_array_equal(le.transform(unknown), [expected])
+    assert_array_equal(unknown, unknown_copy)
+
 
 def test_label_encoder_negative_ints():
     le = LabelEncoder()