diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index eee30948e8db4..7516b2af9ec82 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -16,7 +16,7 @@
 from ..utils import deprecated
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
-from .label import LabelEncoder
+from .label import _encode, _encode_check_unknown
 
 range = six.moves.range
 
@@ -104,32 +104,30 @@ def _fit(self, X, handle_unknown='error'):
         n_samples, n_features = X.shape
 
         if self._categories != 'auto':
-            for cats in self._categories:
-                if not np.all(np.sort(cats) == np.array(cats)):
-                    raise ValueError("Unsorted categories are not yet "
-                                     "supported")
+            if X.dtype != object:
+                for cats in self._categories:
+                    if not np.all(np.sort(cats) == np.array(cats)):
+                        raise ValueError("Unsorted categories are not "
+                                         "supported for numerical categories")
             if len(self._categories) != n_features:
                 raise ValueError("Shape mismatch: if n_values is an array,"
                                  " it has to be of shape (n_features,).")
 
-        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
+        self.categories_ = []
 
         for i in range(n_features):
-            le = self._label_encoders_[i]
             Xi = X[:, i]
             if self._categories == 'auto':
-                le.fit(Xi)
+                cats = _encode(Xi)
             else:
-                if handle_unknown == 'error':
-                    valid_mask = np.in1d(Xi, self._categories[i])
-                    if not np.all(valid_mask):
-                        diff = np.unique(Xi[~valid_mask])
+                cats = np.array(self._categories[i], dtype=X.dtype)
+                if self.handle_unknown == 'error':
+                    diff = _encode_check_unknown(Xi, cats)
+                    if diff:
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
-                le.classes_ = np.array(self._categories[i], dtype=X.dtype)
-
-        self.categories_ = [le.classes_ for le in self._label_encoders_]
+            self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
@@ -145,11 +143,11 @@ def _transform(self, X, handle_unknown='error'):
 
         for i in range(n_features):
             Xi = X[:, i]
-            valid_mask = np.in1d(Xi, self.categories_[i])
+            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
+                                                     return_mask=True)
 
             if not np.all(valid_mask):
                 if handle_unknown == 'error':
-                    diff = np.unique(X[~valid_mask, i])
                     msg = ("Found unknown categories {0} in column {1}"
                            " during transform".format(diff, i))
                     raise ValueError(msg)
@@ -160,7 +158,8 @@ def _transform(self, X, handle_unknown='error'):
                     X_mask[:, i] = valid_mask
                     Xi = Xi.copy()
                     Xi[~valid_mask] = self.categories_[i][0]
-            X_int[:, i] = self._label_encoders_[i].transform(Xi)
+            _, encoded = _encode(Xi, self.categories_[i], encode=True)
+            X_int[:, i] = encoded
 
         return X_int, X_mask
 
@@ -195,8 +194,9 @@ class OneHotEncoder(_BaseEncoder):
 
         - 'auto' : Determine categories automatically from the training data.
         - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories must be sorted and should not mix
-          strings and numeric values.
+          column. The passed categories should not mix strings and numeric
+          values within a single feature, and should be sorted in case of
+          numeric values.
 
         The used categories can be found in the ``categories_`` attribute.
 
@@ -713,8 +713,8 @@ class OrdinalEncoder(_BaseEncoder):
 
         - 'auto' : Determine categories automatically from the training data.
         - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories must be sorted and should not mix
-          strings and numeric values.
+          column. The passed categories should not mix strings and numeric
+          values, and should be sorted in case of numeric values.
 
         The used categories can be found in the ``categories_`` attribute.
 
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 043067fa37a8c..51faccf1a30a1 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -37,6 +37,129 @@
 ]
 
 
+def _encode_numpy(values, uniques=None, encode=False):
+    # only used in _encode below, see docstring there for details
+    if uniques is None:
+        if encode:
+            uniques, encoded = np.unique(values, return_inverse=True)
+            return uniques, encoded
+        else:
+            # unique sorts
+            return np.unique(values)
+    if encode:
+        diff = _encode_check_unknown(values, uniques)
+        if diff:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(diff))
+        encoded = np.searchsorted(uniques, values)
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode_python(values, uniques=None, encode=False):
+    # only used in _encode below, see docstring there for details
+    if uniques is None:
+        uniques = sorted(set(values))
+        uniques = np.array(uniques, dtype=values.dtype)
+    if encode:
+        table = {val: i for i, val in enumerate(uniques)}
+        try:
+            encoded = np.array([table[v] for v in values])
+        except KeyError as e:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(e))
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode(values, uniques=None, encode=False):
+    """Helper function to factorize (find uniques) and encode values.
+
+    Uses pure python method for object dtype, and numpy method for
+    all other dtypes.
+    The numpy method has the limitation that the `uniques` need to
+    be sorted. Importantly, this is not checked but assumed to already be
+    the case. The calling method needs to ensure this for all non-object
+    values.
+
+    Parameters
+    ----------
+    values : array
+        Values to factorize or encode.
+    uniques : array, optional
+        If passed, uniques are not determined from passed values (this
+        can be because the user specified categories, or because they
+        already have been determined in fit).
+    encode : bool, default False
+        If True, also encode the values into integer codes based on `uniques`.
+
+    Returns
+    -------
+    uniques
+        If ``encode=False``. The unique values are sorted if the `uniques`
+        parameter was None (and thus inferred from the data).
+    (uniques, encoded)
+        If ``encode=True``.
+
+    """
+    if values.dtype == object:
+        return _encode_python(values, uniques, encode)
+    else:
+        return _encode_numpy(values, uniques, encode)
+
+
+def _encode_check_unknown(values, uniques, return_mask=False):
+    """
+    Helper function to check for unknowns in values to be encoded.
+
+    Uses pure python method for object dtype, and numpy method for
+    all other dtypes.
+
+    Parameters
+    ----------
+    values : array
+        Values to check for unknowns.
+    uniques : array
+        Allowed uniques values.
+    return_mask : bool, default False
+        If True, return a mask of the same shape as `values` indicating
+        the valid values.
+
+    Returns
+    -------
+    diff : list
+        The unique values present in `values` and not in `uniques` (the
+        unknown values).
+    valid_mask : boolean array
+        Additionally returned if ``return_mask=True``.
+
+    """
+    if values.dtype == object:
+        uniques_set = set(uniques)
+        diff = list(set(values) - uniques_set)
+        if return_mask:
+            if diff:
+                valid_mask = np.array([val in uniques_set for val in values])
+            else:
+                valid_mask = np.ones(len(values), dtype=bool)
+            return diff, valid_mask
+        else:
+            return diff
+    else:
+        unique_values = np.unique(values)
+        diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
+        if return_mask:
+            if diff:
+                valid_mask = np.in1d(values, uniques)
+            else:
+                valid_mask = np.ones(len(values), dtype=bool)
+            return diff, valid_mask
+        else:
+            return diff
+
+
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.
 
@@ -94,7 +217,7 @@ def fit(self, y):
         self : returns an instance of self.
         """
         y = column_or_1d(y, warn=True)
-        self.classes_ = np.unique(y)
+        self.classes_ = _encode(y)
         return self
 
     def fit_transform(self, y):
@@ -110,7 +233,7 @@ def fit_transform(self, y):
         y : array-like of shape [n_samples]
         """
         y = column_or_1d(y, warn=True)
-        self.classes_, y = np.unique(y, return_inverse=True)
+        self.classes_, y = _encode(y, encode=True)
         return y
 
     def transform(self, y):
@@ -131,12 +254,8 @@ def transform(self, y):
         if _num_samples(y) == 0:
             return np.array([])
 
-        classes = np.unique(y)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError(
-                    "y contains previously unseen labels: %s" % str(diff))
-        return np.searchsorted(self.classes_, y)
+        _, y = _encode(y, uniques=self.classes_, encode=True)
+        return y
 
     def inverse_transform(self, y):
         """Transform labels back to original encoding.
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index d5acd110e2865..d4f8aaefc34af 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -339,10 +339,10 @@ def test_one_hot_encoder_set_params():
 
 
 def check_categorical_onehot(X):
-    enc = OneHotEncoder()
+    enc = OneHotEncoder(categories='auto')
     Xtr1 = enc.fit_transform(X)
 
-    enc = OneHotEncoder(sparse=False)
+    enc = OneHotEncoder(categories='auto', sparse=False)
     Xtr2 = enc.fit_transform(X)
 
     assert_allclose(Xtr1.toarray(), Xtr2)
@@ -351,17 +351,20 @@ def check_categorical_onehot(X):
     return Xtr1.toarray()
 
 
-def test_one_hot_encoder():
-    X = [['abc', 1, 55], ['def', 2, 55]]
-
+@pytest.mark.parametrize("X", [
+    [['def', 1, 55], ['abc', 2, 55]],
+    np.array([[10, 1, 55], [5, 2, 55]]),
+    np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
+    ], ids=['mixed', 'numeric', 'object'])
+def test_one_hot_encoder(X):
     Xtr = check_categorical_onehot(np.array(X)[:, [0]])
-    assert_allclose(Xtr, [[1, 0], [0, 1]])
+    assert_allclose(Xtr, [[0, 1], [1, 0]])
 
     Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
-    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
+    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])
 
-    Xtr = OneHotEncoder().fit_transform(X)
-    assert_allclose(Xtr.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
+    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
+    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
 
 
 def test_one_hot_encoder_inverse():
@@ -449,7 +452,8 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
     # when specifying categories manually, unknown categories should already
     # raise when fitting
     enc = OneHotEncoder(categories=cats)
-    assert_raises(ValueError, enc.fit, X2)
+    with pytest.raises(ValueError, match="Found unknown categories"):
+        enc.fit(X2)
     enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
     exp = np.array([[1., 0., 0.], [0., 0., 0.]])
     assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
@@ -458,10 +462,20 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
 def test_one_hot_encoder_unsorted_categories():
     X = np.array([['a', 'b']], dtype=object).T
 
-    # unsorted passed categories raises for now
-    enc = OneHotEncoder(categories=[['c', 'b', 'a']])
-    msg = re.escape('Unsorted categories are not yet supported')
-    assert_raises_regex(ValueError, msg, enc.fit_transform, X)
+    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
+    exp = np.array([[0., 1., 0.],
+                    [1., 0., 0.]])
+    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
+    assert_array_equal(enc.fit_transform(X).toarray(), exp)
+    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
+    assert np.issubdtype(enc.categories_[0].dtype, np.object_)
+
+    # unsorted passed categories still raise for numerical values
+    X = np.array([[1, 2]]).T
+    enc = OneHotEncoder(categories=[[2, 1, 3]])
+    msg = 'Unsorted categories are not supported'
+    with pytest.raises(ValueError, match=msg):
+        enc.fit_transform(X)
 
 
 def test_one_hot_encoder_specified_categories_mixed_columns():
@@ -487,9 +501,12 @@ def test_one_hot_encoder_pandas():
     assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
 
 
-def test_ordinal_encoder():
-    X = [['abc', 2, 55], ['def', 1, 55]]
-
+@pytest.mark.parametrize("X", [
+    [['abc', 2, 55], ['def', 1, 55]],
+    np.array([[10, 2, 55], [20, 1, 55]]),
+    np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
+    ], ids=['mixed', 'numeric', 'object'])
+def test_ordinal_encoder(X):
     enc = OrdinalEncoder()
     exp = np.array([[0, 1, 0],
                     [1, 0, 0]], dtype='int64')
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index faa0cc3ce275b..f8f4ee4870acf 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+import pytest
+
 from scipy.sparse import issparse
 from scipy.sparse import coo_matrix
 from scipy.sparse import csc_matrix
@@ -24,6 +26,7 @@
 from sklearn.preprocessing.label import _inverse_binarize_thresholding
 from sklearn.preprocessing.label import _inverse_binarize_multiclass
+from sklearn.preprocessing.label import _encode
 
 from sklearn import datasets
 
@@ -169,8 +172,33 @@ def test_label_binarizer_errors():
         [1, 2, 3])
 
 
-def test_label_encoder():
-    # Test LabelEncoder's transform and inverse_transform methods
+@pytest.mark.parametrize(
+        "values, classes, unknown",
+        [(np.array([2, 1, 3, 1, 3], dtype='int64'),
+          np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
+         (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
+          np.array(['a', 'b', 'c'], dtype=object),
+          np.array(['d'], dtype=object)),
+         (np.array(['b', 'a', 'c', 'a', 'c']),
+          np.array(['a', 'b', 'c']), np.array(['d']))],
+        ids=['int64', 'object', 'str'])
+def test_label_encoder(values, classes, unknown):
+    # Test LabelEncoder's transform, fit_transform and
+    # inverse_transform methods
+    le = LabelEncoder()
+    le.fit(values)
+    assert_array_equal(le.classes_, classes)
+    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
+    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
+    le = LabelEncoder()
+    ret = le.fit_transform(values)
+    assert_array_equal(ret, [1, 0, 2, 0, 2])
+
+    with pytest.raises(ValueError, match="unseen labels"):
+        le.transform(unknown)
+
+
+def test_label_encoder_negative_ints():
     le = LabelEncoder()
     le.fit([1, 1, 4, 5, -1, 0])
     assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
@@ -180,20 +208,13 @@ def test_label_encoder():
                        [0, 1, 4, 4, 5, -1, -1])
     assert_raises(ValueError, le.transform, [0, 6])
 
-    le.fit(["apple", "orange"])
-    msg = "bad input shape"
-    assert_raise_message(ValueError, msg, le.transform, "apple")
-
-
-def test_label_encoder_fit_transform():
-    # Test fit_transform
-    le = LabelEncoder()
-    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
-    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])
 
+@pytest.mark.parametrize("dtype", ['str', 'object'])
+def test_label_encoder_str_bad_shape(dtype):
     le = LabelEncoder()
-    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
-    assert_array_equal(ret, [1, 1, 2, 0])
+    le.fit(np.array(["apple", "orange"], dtype=dtype))
+    msg = "bad input shape"
+    assert_raise_message(ValueError, msg, le.transform, "apple")
 
 
 def test_label_encoder_errors():
@@ -214,9 +235,15 @@ def test_label_encoder_errors():
     assert_raise_message(ValueError, msg, le.inverse_transform, "")
 
 
-def test_label_encoder_empty_array():
+@pytest.mark.parametrize(
+        "values",
+        [np.array([2, 1, 3, 1, 3], dtype='int64'),
+         np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
+         np.array(['b', 'a', 'c', 'a', 'c'])],
+        ids=['int64', 'object', 'str'])
+def test_label_encoder_empty_array(values):
     le = LabelEncoder()
-    le.fit(np.array(["1", "2", "1", "2", "2"]))
+    le.fit(values)
     # test empty transform
     transformed = le.transform([])
     assert_array_equal(np.array([]), transformed)
@@ -536,3 +563,22 @@ def test_inverse_binarize_multiclass():
                                             [0, 0, 0]]),
                                  np.arange(3))
     assert_array_equal(got, np.array([1, 1, 0]))
+
+
+@pytest.mark.parametrize(
+        "values, expected",
+        [(np.array([2, 1, 3, 1, 3], dtype='int64'),
+          np.array([1, 2, 3], dtype='int64')),
+         (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
+          np.array(['a', 'b', 'c'], dtype=object)),
+         (np.array(['b', 'a', 'c', 'a', 'c']),
+          np.array(['a', 'b', 'c']))],
+        ids=['int64', 'object', 'str'])
+def test_encode_util(values, expected):
+    uniques = _encode(values)
+    assert_array_equal(uniques, expected)
+    uniques, encoded = _encode(values, encode=True)
+    assert_array_equal(uniques, expected)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+    _, encoded = _encode(values, uniques, encode=True)
+    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
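
For reviewers, a minimal sketch of how the new private helpers behave, based on the docstrings and
test_encode_util above. It assumes the patched tree is installed; `_encode` and
`_encode_check_unknown` live in `sklearn.preprocessing.label` and are internal utilities, not
public API:

    import numpy as np
    from sklearn.preprocessing.label import _encode, _encode_check_unknown
    from sklearn.preprocessing import OneHotEncoder

    values = np.array(['b', 'a', 'c', 'a', 'c'], dtype=object)

    # Factorize only: returns the sorted unique categories
    # (pure-Python path for object dtype, np.unique otherwise).
    uniques = _encode(values)
    # -> array(['a', 'b', 'c'], dtype=object)

    # Factorize and encode in one pass.
    uniques, encoded = _encode(values, encode=True)
    # encoded -> array([1, 0, 2, 0, 2])

    # Encode against known categories; unseen values raise ValueError.
    _, encoded = _encode(values, uniques=uniques, encode=True)

    # Check for unknown values without raising; with return_mask=True a
    # boolean mask marking the valid entries is returned as well.
    diff, valid_mask = _encode_check_unknown(
        np.array(['a', 'd'], dtype=object), uniques, return_mask=True)
    # diff -> ['d'], valid_mask -> array([ True, False])

    # User-visible effect of the change: unsorted categories now work for
    # object/string data (they still raise for numerical categories).
    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
    enc.fit(np.array([['a'], ['b']], dtype=object))
    # enc.categories_[0] -> array(['b', 'a', 'c'], dtype=object)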