Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 47 additions & 47 deletions sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,77 +82,77 @@ def _check_fitted(self):
if not hasattr(self, "classes_"):
raise ValueError("LabelEncoder was not fitted yet.")

def fit(self, y):
def fit(self, X, y=None):
"""Fit label encoder

Parameters
----------
y : array-like of shape (n_samples,)
X : array-like of shape (n_samples,)
Target values.

Returns
-------
self : returns an instance of self.
"""
y = column_or_1d(y, warn=True)
_check_numpy_unicode_bug(y)
self.classes_ = np.unique(y)
X = column_or_1d(X, warn=True)
_check_numpy_unicode_bug(X)
self.classes_ = np.unique(X)
return self

def fit_transform(self, y):
def fit_transform(self, X, y=None):
"""Fit label encoder and return encoded labels

Parameters
----------
y : array-like of shape [n_samples]
X : array-like of shape [n_samples]
Target values.

Returns
-------
y : array-like of shape [n_samples]
X : array-like of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
_check_numpy_unicode_bug(y)
self.classes_, y = np.unique(y, return_inverse=True)
return y
X = column_or_1d(X, warn=True)
_check_numpy_unicode_bug(X)
self.classes_, X = np.unique(X, return_inverse=True)
return X

def transform(self, y):
def transform(self, X, y=None):
"""Transform labels to normalized encoding.

Parameters
----------
y : array-like of shape [n_samples]
X : array-like of shape [n_samples]
Target values.

Returns
-------
y : array-like of shape [n_samples]
X : array-like of shape [n_samples]
"""
self._check_fitted()

classes = np.unique(y)
classes = np.unique(X)
_check_numpy_unicode_bug(classes)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError("y contains new labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)
return np.searchsorted(self.classes_, X)

def inverse_transform(self, y):
def inverse_transform(self, X):
"""Transform labels back to original encoding.

Parameters
----------
y : numpy array of shape [n_samples]
X : numpy array of shape [n_samples]
Target values.

Returns
-------
y : numpy array of shape [n_samples]
X : numpy array of shape [n_samples]
"""
self._check_fitted()

y = np.asarray(y)
return self.classes_[y]
X = np.asarray(X)
return self.classes_[X]


class LabelBinarizer(BaseEstimator, TransformerMixin):
Expand Down Expand Up @@ -236,79 +236,79 @@ def _check_fitted(self):
if not hasattr(self, "classes_"):
raise ValueError("LabelBinarizer was not fitted yet.")

def fit(self, y):
def fit(self, X, y=None):
"""Fit label binarizer

Parameters
----------
y : numpy array of shape (n_samples,) or sequence of sequences
X : numpy array of shape (n_samples,) or sequence of sequences
Target values. In the multilabel case the nested sequences can
have variable lengths.

Returns
-------
self : returns an instance of self.
"""
y_type = type_of_target(y)
self.multilabel_ = y_type.startswith('multilabel')
x_type = type_of_target(X)
self.multilabel_ = x_type.startswith('multilabel')
if self.multilabel_:
self.indicator_matrix_ = y_type == 'multilabel-indicator'
self.indicator_matrix_ = x_type == 'multilabel-indicator'

self.classes_ = unique_labels(y)
self.classes_ = unique_labels(X)

return self

def transform(self, y):
def transform(self, X, y=None):
"""Transform multi-class labels to binary labels

The output of transform is sometimes referred to by some authors as the
1-of-K coding scheme.

Parameters
----------
y : numpy array of shape [n_samples] or sequence of sequences
X : numpy array of shape [n_samples] or sequence of sequences
Target values. In the multilabel case the nested sequences can
have variable lengths.

Returns
-------
Y : numpy array of shape [n_samples, n_classes]
X : numpy array of shape [n_samples, n_classes]
"""
self._check_fitted()

y_is_multilabel = type_of_target(y).startswith('multilabel')
x_is_multilabel = type_of_target(X).startswith('multilabel')

if y_is_multilabel and not self.multilabel_:
if x_is_multilabel and not self.multilabel_:
raise ValueError("The object was not fitted with multilabel"
" input.")

return label_binarize(y, self.classes_,
return label_binarize(X, self.classes_,
multilabel=self.multilabel_,
pos_label=self.pos_label,
neg_label=self.neg_label)

def inverse_transform(self, Y, threshold=None):
def inverse_transform(self, X, threshold=None):
"""Transform binary labels back to multi-class labels

Parameters
----------
Y : numpy array of shape [n_samples, n_classes]
X : numpy array of shape [n_samples, n_classes]
Target values.

threshold : float or None
Threshold used in the binary and multi-label cases.

Use 0 when:
- Y contains the output of decision_function (classifier)
- X contains the output of decision_function (classifier)
Use 0.5 when:
- Y contains the output of predict_proba
- X contains the output of predict_proba

If None, the threshold is assumed to be half way between
neg_label and pos_label.

Returns
-------
y : numpy array of shape [n_samples] or sequence of sequences
x : numpy array of shape [n_samples] or sequence of sequences
Target values. In the multilabel case the nested sequences can
have variable lengths.

Expand All @@ -327,23 +327,23 @@ def inverse_transform(self, Y, threshold=None):
threshold = self.neg_label + half

if self.multilabel_:
Y = np.array(Y > threshold, dtype=int)
X = np.array(X > threshold, dtype=int)
# Return the predictions in the same format as in fit
if self.indicator_matrix_:
# Label indicator matrix format
return Y
return X
else:
# Lists of tuples format
return [tuple(self.classes_[np.flatnonzero(Y[i])])
for i in range(Y.shape[0])]
return [tuple(self.classes_[np.flatnonzero(X[i])])
for i in range(X.shape[0])]

if len(Y.shape) == 1 or Y.shape[1] == 1:
y = np.array(Y.ravel() > threshold, dtype=int)
if len(X.shape) == 1 or X.shape[1] == 1:
x = np.array(X.ravel() > threshold, dtype=int)

else:
y = Y.argmax(axis=1)
x = X.argmax(axis=1)

return self.classes_[y]
return self.classes_[x]


def label_binarize(y, classes, multilabel=False, neg_label=0, pos_label=1):
Expand Down
28 changes: 28 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from sklearn import datasets
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.pipeline import Pipeline

iris = datasets.load_iris()

Expand Down Expand Up @@ -253,3 +254,30 @@ def test_label_binarize_with_multilabel_indicator():

output = lb.fit(y).transform(y)
assert_array_equal(output, expected)


def test_label_binarizer_with_pipeline():
    # A LabelBinarizer used as the sole step of a Pipeline: fit, transform
    # and inverse_transform are all driven through the pipeline object.
    labels = ["neg", "pos", "pos", "neg"]
    # Binary problem: a single output column, one row per sample.
    expected = np.array([[0, 1, 1, 0]]).T

    pipeline = Pipeline([('binarize', LabelBinarizer())])
    pipeline.fit(labels)

    got = pipeline.transform(labels)
    assert_array_equal(expected, got)
    # Round trip: decoding the binarized output gives back the input labels.
    assert_array_equal(pipeline.inverse_transform(got), labels)


def test_label_encoder_with_pipeline():
    # A LabelEncoder used as the sole step of a Pipeline: fit, transform
    # and inverse_transform are all driven through the pipeline object.
    pipeline = Pipeline([('encode', LabelEncoder())])
    pipeline.fit(np.array([1, 1, 4, 5, -1, 0]))

    raw = np.array([0, 1, 4, 4, 5, -1, -1])
    encoded = np.array([1, 2, 3, 3, 4, 0, 0])

    assert_array_equal(pipeline.transform(raw), encoded)
    # The decoded result is compared against a (1, n_samples) array.
    assert_array_equal(pipeline.inverse_transform(encoded),
                       np.array([raw]))