# Reconstructed from the three-commit patch series "Add MultiLabelHistogram"
# (Charles Moyes, 2019-11-23), whose line breaks had been lost.
#
# Part 1 belongs in sklearn/preprocessing/_label.py.  The series also adds
# "Charles Moyes" to that module's author list and 'MultiLabelHistogram' to
# its ``__all__``.  The class uses ``array``, ``itertools``, ``warnings``,
# ``defaultdict``, ``np``, ``sp``, ``BaseEstimator``, ``TransformerMixin`` and
# ``check_is_fitted`` without the patch importing them, so they are presumably
# already imported by that module -- confirm when applying.
#
# NOTE(review): the series does not touch sklearn/preprocessing/__init__.py,
# so the class is only importable from ``sklearn.preprocessing._label``.


class MultiLabelHistogram(TransformerMixin, BaseEstimator):
    """Transform between an iterable of dicts and a multilabel value matrix.

    Each sample is a mapping ``{class label: numerical value}`` (such as a
    histogram).  It becomes one row of a (samples x classes) matrix whose
    entry ``[i, j]`` holds the value that sample ``i`` associates with
    ``classes_[j]``, and 0 where the sample does not mention that class.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if the output matrix is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of labels
        A copy of the `classes` parameter where provided,
        or otherwise, the sorted set of classes found when fitting.

    Examples
    --------
    >>> from sklearn.preprocessing._label import MultiLabelHistogram
    >>> mlh = MultiLabelHistogram()
    >>> mlh.fit_transform([{1: 5.5, 2: -3.0}, {3: 999}])
    array([[  5.5,  -3. ,   0. ],
           [  0. ,   0. , 999. ]])
    >>> mlh.classes_
    array([1, 2, 3])

    >>> mlh.fit_transform([{'sci-fi': -2.0, 'thriller': 5.0}, {'comedy': 0.1}])
    array([[ 0. , -2. ,  5. ],
           [ 0.1,  0. ,  0. ]])
    >>> list(mlh.classes_)
    ['comedy', 'sci-fi', 'thriller']

    See Also
    --------
    MultiLabelBinarizer : Transform between iterable
        of iterables and a multilabel format.
    """

    def __init__(self, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @staticmethod
    def _label_array(labels):
        """Return `labels` as a 1-D array: int dtype if all ints, else object.

        The array is allocated first and filled afterwards so that labels
        which are themselves sequences (e.g. tuples) stay single elements.
        """
        # Builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
        dtype = int if all(isinstance(c, int) for c in labels) else object
        out = np.empty(len(labels), dtype=dtype)
        out[:] = labels
        return out

    def fit(self, y):
        """Fit the transformer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of dicts
            One ``{label: value}`` dict per sample; labels may be any
            orderable and hashable object. If the `classes` parameter is
            set, `y` will not be iterated.

        Returns
        -------
        self : returns this MultiLabelHistogram instance

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        self._cached_dict = None
        if self.classes is None:
            # Iterating a dict yields its keys, i.e. the class labels.
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError("The classes argument contains duplicate "
                             "classes. Remove these duplicates before passing "
                             "them to MultiLabelHistogram.")
        else:
            classes = self.classes
        self.classes_ = self._label_array(classes)
        return self

    def fit_transform(self, y):
        """Fit the transformer and transform the given samples.

        Parameters
        ----------
        y : iterable of dicts
            One ``{label: value}`` dict per sample; labels may be any
            orderable and hashable object. When `classes` is not set, `y` is
            consumed in a single pass, so a one-shot iterator is accepted.

        Returns
        -------
        y_values : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_values[i, j] = C` iff `classes_[j]` is in
            `y[i]` with associated value C, and 0 otherwise.
        """
        self._cached_dict = None

        if self.classes is not None:
            return self.fit(y).transform(y)

        # Every label seen for the first time gets the next free column:
        # the default factory is the mapping's own current length.
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # Labels in first-seen (column) order, then sorted via np.unique;
        # `inverse` maps each provisional column to its sorted position.
        first_seen = sorted(class_mapping, key=class_mapping.get)
        self.classes_, inverse = np.unique(self._label_array(first_seen),
                                           return_inverse=True)
        # Keep the index dtype of the matrix.  np.asarray copies only when a
        # cast is needed; np.array(..., copy=False) raises there on NumPy 2.
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given samples.

        Parameters
        ----------
        y : iterable of dicts
            A dict of labels for each sample, containing associated
            values for each class.

        Returns
        -------
        y_values : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_values[i, j] = C` iff `classes_[j]` is in
            `y[i]` with associated value C, and 0 otherwise. Labels that are
            not in `classes_` are ignored with a warning.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label -> column mapping, building it on first use."""
        if getattr(self, '_cached_dict', None) is None:
            self._cached_dict = dict(zip(self.classes_,
                                         range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transform the samples with a given label -> column mapping.

        Parameters
        ----------
        y : iterable of dicts
        class_mapping : Mapping
            Maps from label to column index in the value matrix.

        Returns
        -------
        y_values : sparse CSR matrix, shape (n_samples, n_classes)
            Value matrix (float64).
        """
        # 'd' keeps full double precision; 'f' would round e.g. 0.1.
        data = array.array('d')
        indices = array.array('i')
        indptr = array.array('i', [0])
        unknown = set()
        for labels in y:
            for label, value in labels.items():
                try:
                    column = class_mapping[label]
                except KeyError:
                    unknown.add(label)
                    continue
                # Append column and value together so they stay paired; an
                # unknown label contributes neither.
                indices.append(column)
                data.append(value)
            indptr.append(len(indices))
        if unknown:
            warnings.warn('unknown class(es) {0} will be ignored'
                          .format(sorted(unknown, key=str)))

        return sp.csr_matrix((data, indices, indptr),
                             shape=(len(indptr) - 1, len(class_mapping)))

    def inverse_transform(self, yt):
        """Transform the given value matrix back into per-sample dicts.

        Parameters
        ----------
        yt : array-like or sparse matrix of shape (n_samples, n_classes)
            A matrix containing associated values for each class in each
            sample.

        Returns
        -------
        y : list of dicts
            One dict per sample, mapping `classes_[j]` to `yt[i, j]` for
            every non-zero `yt[i, j]`.

        Raises
        ------
        ValueError
            If `yt` does not have one column per fitted class.
        """
        check_is_fitted(self)

        if not sp.issparse(yt):
            # Accept nested lists as well as arrays.
            yt = np.asarray(yt)

        if yt.shape[1] != len(self.classes_):
            raise ValueError('Expected indicator for {0} classes, but got {1}'
                             .format(len(self.classes_), yt.shape[1]))

        if sp.issparse(yt):
            yt = yt.tocsr()
            # Skip explicitly stored zeros so that sparse and dense inputs
            # give the same result.
            return [
                {label: value
                 for label, value in zip(
                     self.classes_.take(yt.indices[start:end]),
                     yt.data[start:end])
                 if value != 0}
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]

        samples = []
        for row in yt:
            present = np.flatnonzero(row)
            samples.append(dict(zip(self.classes_.take(present),
                                    row.take(present))))
        return samples

    def _more_tags(self):
        return {'X_types': ['2dlabels']}


# Part 2 belongs in sklearn/preprocessing/tests/test_label.py.  The series
# adds these imports there:
#     from sklearn.utils._testing import assert_allclose
#     from sklearn.preprocessing._label import MultiLabelHistogram
# ``np`` and ``pytest`` are already used by that test module.


def test_multilabel_histogram():
    mlh = MultiLabelHistogram()
    y = [{1: 5.5, 2: -3.0}, {3: 999}]
    Y = np.array([[5.5, -3., 0.],
                  [0., 0., 999.]])
    assert_allclose(mlh.fit_transform(y), Y)

    mlh = MultiLabelHistogram()
    y = [{'sci-fi': -2.0, 'thriller': 5.0}, {'comedy': 0.1}]
    Y = np.array([[0., -2., 5.],
                  [0.1, 0., 0.]])
    assert_allclose(mlh.fit_transform(y), Y)
    # Values are stored in double precision, so 0.1 survives exactly.
    assert mlh.fit_transform(y)[1, 0] == 0.1


def test_multilabel_histogram_explicit_class_order():
    # Each value must land in its own label's column, even when the column
    # order given through `classes` differs from the dict order.
    mlh = MultiLabelHistogram(classes=[2, 1])
    assert_allclose(mlh.fit_transform([{1: 5.5, 2: -3.0}]), [[-3.0, 5.5]])


def test_multilabel_histogram_unknown_label():
    # An unknown label is dropped together with its value.
    mlh = MultiLabelHistogram(classes=['a', 'b']).fit([])
    with pytest.warns(UserWarning, match='unknown class'):
        yt = mlh.transform([{'z': 7.0, 'b': 2.0}])
    assert_allclose(yt, [[0.0, 2.0]])


def test_multilabel_histogram_inverse_transform():
    y = [{1: 5.5, 2: -3.0}, {3: 999.0}]
    for sparse_output in (False, True):
        mlh = MultiLabelHistogram(sparse_output=sparse_output)
        assert mlh.inverse_transform(mlh.fit_transform(y)) == y