From e40a7f56f0970cb06c3574a379452b9951542c9e Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 29 May 2013 14:14:15 +0200 Subject: [PATCH 01/16] FIX unique_labels in corner case --- sklearn/utils/multiclass.py | 69 ++++++++++++++----- sklearn/utils/tests/test_multiclass.py | 95 +++++++++++++++++++++++++- 2 files changed, 145 insertions(+), 19 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index be79114e3dfda..cc6106eab0074 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -14,16 +14,13 @@ from ..externals.six import string_types -def unique_labels(*lists_of_labels): +def unique_labels(*ys): """Extract an ordered array of unique labels Parameters ---------- - lists_of_labels : list of labels, - The supported "list of labels" are: - - a list / tuple / numpy array of int - - a list of lists / tuples of int; - - a binary indicator matrix (2D numpy array) + ys : array-likes, + Must be either in binary, multiclass or multilabel format. Returns ------- @@ -45,23 +42,59 @@ def unique_labels(*lists_of_labels): array([1, 2, 3]) """ - def _unique_labels(y): - classes = None - if is_multilabel(y): - if is_label_indicator_matrix(y): - classes = np.arange(y.shape[1]) - else: - classes = np.array(sorted(set(chain(*y)))) - else: - classes = np.unique(y) + def _unique_multiclass(y): + return np.unique(y) - return classes + def _unique_sequence_of_sequence(y): + return np.array(sorted(set(chain(*y)))) - if not lists_of_labels: + def _unique_indicator(y): + return np.arange(y.shape[1]) + + if not ys: raise ValueError('No list of labels has been passed.') - return np.unique(np.hstack(_unique_labels(y) for y in lists_of_labels)) + ys_is_multilabels = [is_multilabel(y) for y in ys] + + if len(set(ys_is_multilabels)) != 1: + raise ValueError("Mix of binary / mutliclass and multilabel type") + + if all(ys_is_multilabels): + ys_is_indicator = [is_label_indicator_matrix(y) for y in ys] + + # Mix of indicator and sequence of sequence multilabel format + if len(set(ys_is_indicator)) != 1: + labels = unique_labels(*[y for y in ys + if not is_label_indicator_matrix(y)]) + labels_idx = unique_labels(*[y for y in ys + if is_label_indicator_matrix(y)]) + if labels.size != labels_idx.size: + raise ValueError("Unable to infer mix multilabel type") + + return labels + + # Only indicator multilabel format + elif all(ys_is_indicator): + if len(set(y.shape[1] for y in ys)) > 1: + raise ValueError("Multi-label binary indicator input with " + "different number of labels") + else: + return _unique_indicator(ys[0]) + else: + # Only indicator sequence of sequence multilabel format + _unique_labels = _unique_sequence_of_sequence + + else: + _unique_labels = _unique_multiclass + + # Combine every labels + ys_labels = [_unique_labels(y) for y in ys] + + if len(set(y_labels.dtype.kind for y_labels in ys_labels)) > 1: + raise ValueError("Mix of dtype.kind, can't infered labels set") + + return np.unique(np.hstack(ys_labels)) def _is_integral_float(y): diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index e8ad3b77905d2..1842624213eb6 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -1,5 +1,5 @@ import numpy as np - +from itertools import product from sklearn.externals.six.moves import xrange from sklearn.externals.six import iteritems @@ -117,6 +117,58 @@ ] +EXAMPLES = { + 'multilabel-indicator': [ + np.random.randint(2, size=(10, 10)), + np.array([[0, 1], [1, 0]]), + np.array([[0, 0], [0, 0]]), + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + + # XXX : not considered as multilabel-indicator at the moment + # see is_label_indicator_matrix + # np.array([[0, 1]]), + ], + 'multilabel-sequences': [ + [[0, 1]], + [[0], [1]], + [[1, 2, 3]], + [[1, 2, 1]], # duplicate values, why not? + [[1], [2], [0, 1]], + [[1], [2]], + [[]], + [()], + np.array([[], [1, 2]], dtype='object'), + ], + 'multiclass': [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([[1], [0], [2]]), + [0, 1, 2], + ['a', 'b', 'c'], + ], + 'multiclass-multioutput': [ + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([['a', 'b'], ['c', 'd']]), + np.array([[1, 0, 2]]), + ], + 'binary': [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([[0], [1]]), + [1, -1], + [3, 5], + ['a'], + ['a', 'b'], + ['abc', 'def'], + ], + +} + + def test_unique_labels(): # Empty iterable assert_raises(ValueError, unique_labels) @@ -142,6 +194,47 @@ def test_unique_labels(): assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) + # Border line case with binary indicator matrix + assert_raises(ValueError, unique_labels, [4, 0, 2], np.ones((5, 5))) + assert_raises(ValueError, unique_labels, np.ones((5, 4)), np.ones((5, 5))) + assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), + np.arange(5)) + + # Some tests with strings input + assert_array_equal(unique_labels(["a", "b", "c"], ["d"]), + ["a", "b", "c", "d"]) + assert_array_equal(unique_labels([["a", "b"], ["c"]], [["d"]]), + ["a", "b", "c", "d"]) + + #Mix of multilabel-indicator and multilabel-sequences + assert_array_equal(unique_labels([["a", "b"], ["c"]], np.ones((3, 3))), + ["a", "b", "c"]) + assert_raises(ValueError, unique_labels, [["a", "b"], ["c"]], + np.ones((3, 4))) + assert_raises(ValueError, unique_labels, [["a", "b"], ["c", 'd']], + np.ones((3, 3))) + + assert_array_equal(unique_labels([[1, 2], [3]], np.ones((3, 3))), + [1, 2, 3]) + assert_raises(ValueError, unique_labels, [[1, 2], [3]], + np.ones((3, 4))) + assert_raises(ValueError, unique_labels, [[1, 2], [3, 4]], + np.ones((3, 3))) + + #Mix with binary or multiclass and multilabel + pair_multiclass_multilabel = product(EXAMPLES["multilabel-indicator"] + + EXAMPLES["multilabel-sequences"], + EXAMPLES["multiclass"] + + EXAMPLES["binary"]) + + for y_multilabel, y_multiclass in pair_multiclass_multilabel: + assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) + assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) + + # Mix input type + assert_raises(ValueError, unique_labels, [[1, 2], [3]], + [["a", "d"]]) + def test_is_multilabel(): for group, group_examples in iteritems(EXAMPLES): From e3ca5c13eb2844ff34b3e900346f408bf48062c5 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 29 May 2013 14:51:54 +0200 Subject: [PATCH 02/16] FIX issue with comparable but different dtype --- sklearn/utils/multiclass.py | 4 +++- sklearn/utils/tests/test_multiclass.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index cc6106eab0074..3c55fc24863f1 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -90,8 +90,10 @@ def _unique_indicator(y): # Combine every labels ys_labels = [_unique_labels(y) for y in ys] + labels_type_set = set(y_labels.dtype.kind for y_labels in ys_labels) - if len(set(y_labels.dtype.kind for y_labels in ys_labels)) > 1: + if (not (labels_type_set <= set(["b", "i", "u","f"])) and + not (labels_type_set <= set(["S", "a"," U"]))): raise ValueError("Mix of dtype.kind, can't infered labels set") return np.unique(np.hstack(ys_labels)) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 1842624213eb6..0932d77503e72 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -234,6 +234,7 @@ def test_unique_labels(): # Mix input type assert_raises(ValueError, unique_labels, [[1, 2], [3]], [["a", "d"]]) + assert_array_equal(unique_labels([(2,), (0, 2,)], [(), ()]), [0, 2]) def test_is_multilabel(): From adcb0a811ee0596256ebe66524d2f33e1127129a Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 3 Jun 2013 10:26:30 +0200 Subject: [PATCH 03/16] ENH don't allow mix of input multilabel format --- sklearn/utils/multiclass.py | 21 ++++++------------- sklearn/utils/tests/test_multiclass.py | 28 +++++++++----------------- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3c55fc24863f1..e181c9b82a2ed 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -63,27 +63,18 @@ def _unique_indicator(y): if all(ys_is_multilabels): ys_is_indicator = [is_label_indicator_matrix(y) for y in ys] - # Mix of indicator and sequence of sequence multilabel format - if len(set(ys_is_indicator)) != 1: - labels = unique_labels(*[y for y in ys - if not is_label_indicator_matrix(y)]) - labels_idx = unique_labels(*[y for y in ys - if is_label_indicator_matrix(y)]) - if labels.size != labels_idx.size: - raise ValueError("Unable to infer mix multilabel type") - - return labels - # Only indicator multilabel format - elif all(ys_is_indicator): + if all(ys_is_indicator): if len(set(y.shape[1] for y in ys)) > 1: raise ValueError("Multi-label binary indicator input with " "different number of labels") else: return _unique_indicator(ys[0]) - else: + elif all(not ys_is_indicator for ys_is_indicator in ys_is_indicator): # Only indicator sequence of sequence multilabel format _unique_labels = _unique_sequence_of_sequence + else: + raise ValueError("Mix multilabel input format") else: _unique_labels = _unique_multiclass @@ -92,8 +83,8 @@ def _unique_indicator(y): ys_labels = [_unique_labels(y) for y in ys] labels_type_set = set(y_labels.dtype.kind for y_labels in ys_labels) - if (not (labels_type_set <= set(["b", "i", "u","f"])) and - not (labels_type_set <= set(["S", "a"," U"]))): + if (not (labels_type_set <= set(["b", "i", "u", "f"])) and + not (labels_type_set <= set(["S", "a", " U"]))): raise ValueError("Mix of dtype.kind, can't infered labels set") return np.unique(np.hstack(ys_labels)) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 0932d77503e72..090f2ebc2d468 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -207,27 +207,19 @@ def test_unique_labels(): ["a", "b", "c", "d"]) #Mix of multilabel-indicator and multilabel-sequences - assert_array_equal(unique_labels([["a", "b"], ["c"]], np.ones((3, 3))), - ["a", "b", "c"]) - assert_raises(ValueError, unique_labels, [["a", "b"], ["c"]], - np.ones((3, 4))) - assert_raises(ValueError, unique_labels, [["a", "b"], ["c", 'd']], - np.ones((3, 3))) - - assert_array_equal(unique_labels([[1, 2], [3]], np.ones((3, 3))), - [1, 2, 3]) - assert_raises(ValueError, unique_labels, [[1, 2], [3]], - np.ones((3, 4))) - assert_raises(ValueError, unique_labels, [[1, 2], [3, 4]], - np.ones((3, 3))) + mix_multilabel_format = product(EXAMPLES["multilabel-indicator"], + EXAMPLES["multilabel-sequences"]) + for y_multilabel, y_multiclass in mix_multilabel_format: + assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) + assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) #Mix with binary or multiclass and multilabel - pair_multiclass_multilabel = product(EXAMPLES["multilabel-indicator"] + - EXAMPLES["multilabel-sequences"], - EXAMPLES["multiclass"] + - EXAMPLES["binary"]) + mix_clf_format = product(EXAMPLES["multilabel-indicator"] + + EXAMPLES["multilabel-sequences"], + EXAMPLES["multiclass"] + + EXAMPLES["binary"]) - for y_multilabel, y_multiclass in pair_multiclass_multilabel: + for y_multilabel, y_multiclass in mix_clf_format: assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) From 825b2c0cadcb20caff389394bb1b3d15744fcab9 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 3 Jun 2013 10:29:20 +0200 Subject: [PATCH 04/16] ENH simpler check for mix of string and number input --- sklearn/utils/multiclass.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index e181c9b82a2ed..214c2ba4b0dee 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -81,11 +81,12 @@ def _unique_indicator(y): # Combine every labels ys_labels = [_unique_labels(y) for y in ys] - labels_type_set = set(y_labels.dtype.kind for y_labels in ys_labels) + y_is_string = [y_labels.dtype.type is np.string_ + for y_labels in ys_labels] - if (not (labels_type_set <= set(["b", "i", "u", "f"])) and - not (labels_type_set <= set(["S", "a", " U"]))): - raise ValueError("Mix of dtype.kind, can't infered labels set") + if len(set(y_is_string)) != 1: + raise ValueError("Mix of string and number type: " + "can't infered unique labels set") return np.unique(np.hstack(ys_labels)) From 45847a7b422d8eb37da1125d688c1b817540e8da Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 3 Jun 2013 10:47:18 +0200 Subject: [PATCH 05/16] COSMIT better name --- sklearn/utils/multiclass.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 214c2ba4b0dee..455300bcc2834 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -81,10 +81,10 @@ def _unique_indicator(y): # Combine every labels ys_labels = [_unique_labels(y) for y in ys] - y_is_string = [y_labels.dtype.type is np.string_ - for y_labels in ys_labels] + ys_is_string = [y_labels.dtype.type is np.string_ + for y_labels in ys_labels] - if len(set(y_is_string)) != 1: + if len(set(ys_is_string)) != 1: raise ValueError("Mix of string and number type: " "can't infered unique labels set") From 46d4718712c09804e9d492e1fcbbbfdcf121a24f Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 3 Jun 2013 13:06:38 +0200 Subject: [PATCH 06/16] Typo --- sklearn/utils/multiclass.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 455300bcc2834..5893a79008832 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -58,7 +58,7 @@ def _unique_indicator(y): ys_is_multilabels = [is_multilabel(y) for y in ys] if len(set(ys_is_multilabels)) != 1: - raise ValueError("Mix of binary / mutliclass and multilabel type") + raise ValueError("Mix of binary / mutliclass and multilabel types") if all(ys_is_multilabels): ys_is_indicator = [is_label_indicator_matrix(y) for y in ys] @@ -66,7 +66,7 @@ def _unique_indicator(y): # Only indicator multilabel format if all(ys_is_indicator): if len(set(y.shape[1] for y in ys)) > 1: - raise ValueError("Multi-label binary indicator input with " + raise ValueError("Multi-label binary indicator inputs with " "different number of labels") else: return _unique_indicator(ys[0]) @@ -74,7 +74,7 @@ def _unique_indicator(y): # Only indicator sequence of sequence multilabel format _unique_labels = _unique_sequence_of_sequence else: - raise ValueError("Mix multilabel input format") + raise ValueError("Mix of multilabel input format") else: _unique_labels = _unique_multiclass @@ -85,8 +85,7 @@ def _unique_indicator(y): for y_labels in ys_labels] if len(set(ys_is_string)) != 1: - raise ValueError("Mix of string and number type: " - "can't infered unique labels set") + raise ValueError("Mix of label input type s(string and number)") return np.unique(np.hstack(ys_labels)) From 91e4b9a91f84eb8e164fc5c0dd501fb99ebef473 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Sun, 23 Jun 2013 00:00:12 +0200 Subject: [PATCH 07/16] ENH use type_of_target within unique_labels --- sklearn/utils/multiclass.py | 72 ++++++++++++++------------ sklearn/utils/tests/test_multiclass.py | 59 +++------------------ 2 files changed, 48 insertions(+), 83 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 5893a79008832..17f7be780a55c 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -14,13 +14,32 @@ from ..externals.six import string_types +def _unique_multiclass(y): + return np.unique(y) + + +def _unique_sequence_of_sequence(y): + return np.array(sorted(set(chain(*y)))) + + +def _unique_indicator(y): + return np.arange(y.shape[1]) + + +_FN_UNIQUE_LABELS = { + 'binary': _unique_multiclass, + 'multiclass': _unique_multiclass, + 'multilabel-sequences': _unique_sequence_of_sequence, + 'multilabel-indicator': _unique_indicator, +} + + def unique_labels(*ys): """Extract an ordered array of unique labels Parameters ---------- ys : array-likes, - Must be either in binary, multiclass or multilabel format. Returns ------- @@ -42,42 +61,31 @@ def unique_labels(*ys): array([1, 2, 3]) """ + if not ys: + raise ValueError('No argument has been passed.') - def _unique_multiclass(y): - return np.unique(y) - - def _unique_sequence_of_sequence(y): - return np.array(sorted(set(chain(*y)))) + ys_types = [type_of_target(x) for x in ys] - def _unique_indicator(y): - return np.arange(y.shape[1]) + if len(set(ys_types)) != 1: + if set(ys_types) == set(["binary", "multiclass"]): + label_type = "multiclass" - if not ys: - raise ValueError('No list of labels has been passed.') - - ys_is_multilabels = [is_multilabel(y) for y in ys] - - if len(set(ys_is_multilabels)) != 1: - raise ValueError("Mix of binary / mutliclass and multilabel types") - - if all(ys_is_multilabels): - ys_is_indicator = [is_label_indicator_matrix(y) for y in ys] - - # Only indicator multilabel format - if all(ys_is_indicator): - if len(set(y.shape[1] for y in ys)) > 1: - raise ValueError("Multi-label binary indicator inputs with " - "different number of labels") - else: - return _unique_indicator(ys[0]) - elif all(not ys_is_indicator for ys_is_indicator in ys_is_indicator): - # Only indicator sequence of sequence multilabel format - _unique_labels = _unique_sequence_of_sequence else: - raise ValueError("Mix of multilabel input format") - + raise ValueError("Mix type of y not allowed, got type %s" + % ys_types) else: - _unique_labels = _unique_multiclass + label_type = ys_types[0] + + # Check consistency for the indicator format + if (label_type == "multilabel-indicator" and + len(set(y.shape[1] for y in ys)) > 1): + raise ValueError("Multi-label binary indicator input with " + "different numbers of labels") + + # Get the proper unique function for the given format + _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) + if not _unique_labels: + raise ValueError("Unknown label type") # Combine every labels ys_labels = [_unique_labels(y) for y in ys] diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 090f2ebc2d468..31ec927001e0a 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -117,57 +117,6 @@ ] -EXAMPLES = { - 'multilabel-indicator': [ - np.random.randint(2, size=(10, 10)), - np.array([[0, 1], [1, 0]]), - np.array([[0, 0], [0, 0]]), - np.array([[-1, 1], [1, -1]]), - np.array([[-3, 3], [3, -3]]), - - # XXX : not considered as multilabel-indicator at the moment - # see is_label_indicator_matrix - # np.array([[0, 1]]), - ], - 'multilabel-sequences': [ - [[0, 1]], - [[0], [1]], - [[1, 2, 3]], - [[1, 2, 1]], # duplicate values, why not? - [[1], [2], [0, 1]], - [[1], [2]], - [[]], - [()], - np.array([[], [1, 2]], dtype='object'), - ], - 'multiclass': [ - [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], - np.array([1, 0, 2]), - np.array([[1], [0], [2]]), - [0, 1, 2], - ['a', 'b', 'c'], - ], - 'multiclass-multioutput': [ - np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), - np.array([['a', 'b'], ['c', 'd']]), - np.array([[1, 0, 2]]), - ], - 'binary': [ - [0, 1], - [1, 1], - [], - [0], - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), - np.array([[0], [1]]), - [1, -1], - [3, 5], - ['a'], - ['a', 'b'], - ['abc', 'def'], - ], - -} - def test_unique_labels(): # Empty iterable @@ -219,6 +168,14 @@ def test_unique_labels(): EXAMPLES["multiclass"] + EXAMPLES["binary"]) + for example in NON_ARRAY_LIKE_EXAMPLES: + assert_raises(ValueError, unique_labels, example) + + for y_type in ["unknown", "continuous", 'continuous-multioutput', + 'multiclass-multioutput']: + for example in EXAMPLES[y_type]: + assert_raises(ValueError, unique_labels, example) + for y_multilabel, y_multiclass in mix_clf_format: assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) From 61d9f41d58a746ba1518450127b92c7298b96663 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 24 Jun 2013 14:01:35 +0200 Subject: [PATCH 08/16] ENH improve documentation with allowed label types --- sklearn/utils/multiclass.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 17f7be780a55c..a927d9e0a15dc 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -37,6 +37,15 @@ def _unique_indicator(y): def unique_labels(*ys): """Extract an ordered array of unique labels + We don't allow: + - mix of multilabel and multiclass (single label) targets + - mix of label indicator matrix and anything else, + because there are no explicit labels) + - mix of label indicator matrices of different sizes + - mix of string and integer labels + + At the moment, we also don't allow "mutliclass-multioutput" input type. + Parameters ---------- ys : array-likes, @@ -71,7 +80,7 @@ def unique_labels(*ys): label_type = "multiclass" else: - raise ValueError("Mix type of y not allowed, got type %s" + raise ValueError("Mix type of y not allowed, got types %s" % ys_types) else: label_type = ys_types[0] @@ -89,11 +98,10 @@ def unique_labels(*ys): # Combine every labels ys_labels = [_unique_labels(y) for y in ys] - ys_is_string = [y_labels.dtype.type is np.string_ - for y_labels in ys_labels] - if len(set(ys_is_string)) != 1: - raise ValueError("Mix of label input type s(string and number)") + if (len(set(y_labels.dtype.type is np.string_ + for y_labels in ys_labels)) != 1): + raise ValueError("Mix of label input types (string and number)") return np.unique(np.hstack(ys_labels)) From aef47a751a7d1f3679db5c642038f0a2597e0460 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 24 Jun 2013 15:28:29 +0200 Subject: [PATCH 09/16] ENH check that we don't mix number and strings --- sklearn/utils/multiclass.py | 20 ++++++++++++-------- sklearn/utils/tests/test_multiclass.py | 8 +++++++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index a927d9e0a15dc..ad0fa97b5f17c 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -8,6 +8,7 @@ """ from collections import Sequence from itertools import chain +from itertools import imap import numpy as np @@ -73,8 +74,8 @@ def unique_labels(*ys): if not ys: raise ValueError('No argument has been passed.') + # Check that we don't mix label format ys_types = [type_of_target(x) for x in ys] - if len(set(ys_types)) != 1: if set(ys_types) == set(["binary", "multiclass"]): label_type = "multiclass" @@ -91,19 +92,22 @@ def unique_labels(*ys): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") + # Check that we don't mix string and number type + if ((label_type in ("binary", "multiclass") and + len(set([isinstance(x, basestring) + for y in ys for x in y])) > 1) or + (label_type == "multilabel-sequences" and + len(set.union(*[set(imap(lambda x: isinstance(x, basestring), + chain(*y))) for y in ys])) > 1)): + raise ValueError("Mix of label input types (string and number)") + # Get the proper unique function for the given format _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type") # Combine every labels - ys_labels = [_unique_labels(y) for y in ys] - - if (len(set(y_labels.dtype.type is np.string_ - for y_labels in ys_labels)) != 1): - raise ValueError("Mix of label input types (string and number)") - - return np.unique(np.hstack(ys_labels)) + return np.unique(np.hstack(_unique_labels(y) for y in ys)) def _is_integral_float(y): diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 31ec927001e0a..ebe4ff6bcfaed 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -180,10 +180,16 @@ def test_unique_labels(): assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) - # Mix input type + # Mix string and number input type assert_raises(ValueError, unique_labels, [[1, 2], [3]], [["a", "d"]]) + assert_raises(ValueError, unique_labels, ["1", 2]) + assert_raises(ValueError, unique_labels, [["1", 2], [3]]) + assert_raises(ValueError, unique_labels, [["1", "2"], [3]]) + assert_array_equal(unique_labels([(2,), (0, 2,)], [(), ()]), [0, 2]) + assert_array_equal(unique_labels([("2",), ("0", "2",)], [(), ()]), + ["0", "2"]) def test_is_multilabel(): From 7856a72da8b4ba0e24baff56fad54b9343c804b0 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 24 Jun 2013 15:38:34 +0200 Subject: [PATCH 10/16] Flatten label type checking --- sklearn/utils/multiclass.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index ad0fa97b5f17c..3e004f20655b4 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -75,16 +75,14 @@ def unique_labels(*ys): raise ValueError('No argument has been passed.') # Check that we don't mix label format - ys_types = [type_of_target(x) for x in ys] - if len(set(ys_types)) != 1: - if set(ys_types) == set(["binary", "multiclass"]): - label_type = "multiclass" - - else: - raise ValueError("Mix type of y not allowed, got types %s" - % ys_types) - else: - label_type = ys_types[0] + ys_types = set(type_of_target(x) for x in ys) + if ys_types == set(["binary", "multiclass"]): + ys_types = set(["multiclass"]) + + if len(ys_types) > 1: + raise ValueError("Mix type of y not allowed, got types %s" % ys_types) + + label_type = ys_types.pop() # Check consistency for the indicator format if (label_type == "multilabel-indicator" and @@ -92,7 +90,7 @@ def unique_labels(*ys): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") - # Check that we don't mix string and number type + # Check that we don't mix string type with number type if ((label_type in ("binary", "multiclass") and len(set([isinstance(x, basestring) for y in ys for x in y])) > 1) or @@ -106,7 +104,7 @@ def unique_labels(*ys): if not _unique_labels: raise ValueError("Unknown label type") - # Combine every labels + # Combine labels return np.unique(np.hstack(_unique_labels(y) for y in ys)) From 33016ed7b910c42d3855efc82a5ed69ea588a150 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Mon, 24 Jun 2013 15:41:56 +0200 Subject: [PATCH 11/16] TST add smoke test for all supported format --- sklearn/utils/tests/test_multiclass.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index ebe4ff6bcfaed..2492b14cb1421 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -155,6 +155,12 @@ def test_unique_labels(): assert_array_equal(unique_labels([["a", "b"], ["c"]], [["d"]]), ["a", "b", "c", "d"]) + # Smoke test for all supported format + for format in ["binary", "multiclass", "multilabel-sequences", + "multilabel-indicator"]: + for y in EXAMPLES[format]: + unique_labels(y) + #Mix of multilabel-indicator and multilabel-sequences mix_multilabel_format = product(EXAMPLES["multilabel-indicator"], EXAMPLES["multilabel-sequences"]) From 6c68cac70a8577eb073cc32c043dc73ab58bdd15 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 25 Jun 2013 11:58:30 +0200 Subject: [PATCH 12/16] COSMIT --- sklearn/utils/tests/test_multiclass.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 2492b14cb1421..3cd1f1f197171 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -161,6 +161,15 @@ def test_unique_labels(): for y in EXAMPLES[format]: unique_labels(y) + # We don't support those format at the moment + for example in NON_ARRAY_LIKE_EXAMPLES: + assert_raises(ValueError, unique_labels, example) + + for y_type in ["unknown", "continuous", 'continuous-multioutput', + 'multiclass-multioutput']: + for example in EXAMPLES[y_type]: + assert_raises(ValueError, unique_labels, example) + #Mix of multilabel-indicator and multilabel-sequences mix_multilabel_format = product(EXAMPLES["multilabel-indicator"], EXAMPLES["multilabel-sequences"]) @@ -174,14 +183,6 @@ def test_unique_labels(): EXAMPLES["multiclass"] + EXAMPLES["binary"]) - for example in NON_ARRAY_LIKE_EXAMPLES: - assert_raises(ValueError, unique_labels, example) - - for y_type in ["unknown", "continuous", 'continuous-multioutput', - 'multiclass-multioutput']: - for example in EXAMPLES[y_type]: - assert_raises(ValueError, unique_labels, example) - for y_multilabel, y_multiclass in mix_clf_format: assert_raises(ValueError, unique_labels, y_multiclass, y_multilabel) assert_raises(ValueError, unique_labels, y_multilabel, y_multiclass) From 7699100be87763ef49ac75fa1298029f185e4377 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 25 Jun 2013 14:09:30 +0200 Subject: [PATCH 13/16] PY3K use six.string_type --- sklearn/utils/multiclass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3e004f20655b4..2b7cc35e4ae11 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -92,10 +92,10 @@ def unique_labels(*ys): # Check that we don't mix string type with number type if ((label_type in ("binary", "multiclass") and - len(set([isinstance(x, basestring) + len(set([isinstance(x, string_types) for y in ys for x in y])) > 1) or (label_type == "multilabel-sequences" and - len(set.union(*[set(imap(lambda x: isinstance(x, basestring), + len(set.union(*[set(imap(lambda x: isinstance(x, string_types), chain(*y))) for y in ys])) > 1)): raise ValueError("Mix of label input types (string and number)") From 109f2fb630e45b3214429ae4c8bdac034066236d Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 25 Jun 2013 14:33:54 +0200 Subject: [PATCH 14/16] OPTIM + ENH simplify mix string and number check --- sklearn/utils/multiclass.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 2b7cc35e4ae11..4f192191fcf54 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -16,15 +16,18 @@ def _unique_multiclass(y): - return np.unique(y) + if isinstance(y, np.ndarray): + return np.unique(y) + else: + return set(y) def _unique_sequence_of_sequence(y): - return np.array(sorted(set(chain(*y)))) + return set(chain.from_iterable(y)) def _unique_indicator(y): - return np.arange(y.shape[1]) + return np.arange(y.shape[0]) _FN_UNIQUE_LABELS = { @@ -90,22 +93,18 @@ def unique_labels(*ys): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") - # Check that we don't mix string type with number type - if ((label_type in ("binary", "multiclass") and - len(set([isinstance(x, string_types) - for y in ys for x in y])) > 1) or - (label_type == "multilabel-sequences" and - len(set.union(*[set(imap(lambda x: isinstance(x, string_types), - chain(*y))) for y in ys])) > 1)): - raise ValueError("Mix of label input types (string and number)") - - # Get the proper unique function for the given format + # Get the unique set of labels _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type") - # Combine labels - return np.unique(np.hstack(_unique_labels(y) for y in ys)) + y_labels = set(chain.from_iterable(imap(_unique_labels, ys))) + + # Check that we don't mix string type with number type + if (len(set(isinstance(label, string_types) for label in y_labels)) > 1): + raise ValueError("Mix of label input types (string and number)") + + return np.array(sorted(y_labels)) def _is_integral_float(y): From a9197f3dad8b6afb26419fe84d8bee4810be92e8 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 25 Jun 2013 14:40:51 +0200 Subject: [PATCH 15/16] FIX bug with indicator format --- sklearn/utils/multiclass.py | 8 ++++---- sklearn/utils/tests/test_multiclass.py | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 4f192191fcf54..419b38a241ced 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,7 @@ def _unique_sequence_of_sequence(y): def _unique_indicator(y): - return np.arange(y.shape[0]) + return np.arange(y.shape[1]) _FN_UNIQUE_LABELS = { @@ -98,13 +98,13 @@ def unique_labels(*ys): if not _unique_labels: raise ValueError("Unknown label type") - y_labels = set(chain.from_iterable(imap(_unique_labels, ys))) + ys_labels = set(chain.from_iterable(imap(_unique_labels, ys))) # Check that we don't mix string type with number type - if (len(set(isinstance(label, string_types) for label in y_labels)) > 1): + if (len(set(isinstance(label, string_types) for label in ys_labels)) > 1): raise ValueError("Mix of label input types (string and number)") - return np.array(sorted(y_labels)) + return np.array(sorted(ys_labels)) def _is_integral_float(y): diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 3cd1f1f197171..775a3a8a2248e 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -117,7 +117,6 @@ ] - def test_unique_labels(): # Empty iterable assert_raises(ValueError, unique_labels) @@ -137,6 +136,10 @@ def test_unique_labels(): [0, 0, 0]])), np.arange(3)) + assert_array_equal(unique_labels(np.array([[0, 0, 1], + [0, 0, 0]])), + np.arange(3)) + # Several arrays passed assert_array_equal(unique_labels([4, 0, 2], xrange(5)), np.arange(5)) From a62abe66ff14fc60ddeea38e3d527baebe654463 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 25 Jun 2013 14:49:00 +0200 Subject: [PATCH 16/16] ENH use a comprehension over imap --- sklearn/utils/multiclass.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 419b38a241ced..11289c433bf2d 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -8,7 +8,6 @@ """ from collections import Sequence from itertools import chain -from itertools import imap import numpy as np @@ -98,7 +97,7 @@ def unique_labels(*ys): if not _unique_labels: raise ValueError("Unknown label type") - ys_labels = set(chain.from_iterable(imap(_unique_labels, ys))) + ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) # Check that we don't mix string type with number type if (len(set(isinstance(label, string_types) for label in ys_labels)) > 1):