diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index dbf8126f8a889..b772d5d958d85 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -9,7 +9,9 @@ import warnings from abc import ABCMeta, abstractmethod +import array import numpy as np +import scipy.sparse as sp from scipy.sparse import csr_matrix, issparse from .ball_tree import BallTree @@ -615,15 +617,40 @@ def fit(self, X, y): else: self.outputs_2d_ = True - self.classes_ = [] - self._y = np.empty(y.shape, dtype=np.int) - for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes) + self.sparse_target_input_ = sp.issparse(y) - if not self.outputs_2d_: - self.classes_ = self.classes_[0] - self._y = self._y.ravel() + if not sp.issparse(y): + self.classes_ = [] + self._y = np.empty(y.shape, dtype=np.int) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], + return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + else: + y = y.tocsc() + y.eliminate_zeros() + nnz = np.diff(y.indptr) + data = array.array('i') + self.classes_ = [] + + for k in range(y.shape[1]): + k_col_data = y.data[y.indptr[k]:y.indptr[k + 1]] + classes, data_k = np.unique(k_col_data, return_inverse=True) + + if not nnz[k] == y.shape[0]: + classes = np.insert(classes, 0, 0) + data_k += 1 + self.classes_.append(classes) + data.extend(data_k) + + _y = sp.csc_matrix((data, y.indices, y.indptr), shape=y.shape, + dtype=int) + + self._y = _y return self._fit(X) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 36d7c07108da0..019bc8dabef4e 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -7,15 +7,20 @@ # Multi-output support by Arnaud Joly # # License: BSD 3 clause (C) INRIA, University of Amsterdam - +import array import numpy as np 
+import scipy.sparse as sp + from scipy import stats from ..utils.extmath import weighted_mode -from .base import \ - _check_weights, _get_weights, \ - NeighborsBase, KNeighborsMixin,\ - RadiusNeighborsMixin, SupervisedIntegerMixin +from .base import _check_weights +from .base import _get_weights +from .base import NeighborsBase +from .base import KNeighborsMixin +from .base import RadiusNeighborsMixin +from .base import SupervisedIntegerMixin + from ..base import ClassifierMixin from ..utils import check_array @@ -146,18 +151,42 @@ def predict(self, X): n_samples = X.shape[0] weights = _get_weights(neigh_dist, self.weights) - y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) - for k, classes_k in enumerate(classes_): - if weights is None: - mode, _ = stats.mode(_y[neigh_ind, k], axis=1) - else: - mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) + if not self.sparse_target_input_: + y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) + for k, classes_k in enumerate(classes_): + if weights is None: + mode, _ = stats.mode(_y[neigh_ind, k], axis=1) + else: + mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) - mode = np.asarray(mode.ravel(), dtype=np.intp) - y_pred[:, k] = classes_k.take(mode) + mode = np.asarray(mode.ravel(), dtype=np.intp) + y_pred[:, k] = classes_k.take(mode) - if not self.outputs_2d_: - y_pred = y_pred.ravel() + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + else: + + data = [] + indices = array.array('i') + indptr = array.array('i', [0]) + + for k, classes_k in enumerate(classes_): + neigh_lbls_k = _y.getcol(k).toarray().ravel()[neigh_ind] + neigh_lbls_k = classes_k[neigh_lbls_k] + + if weights is None: + mode, _ = stats.mode(neigh_lbls_k, axis=1) + else: + mode, _ = weighted_mode(neigh_lbls_k, weights, axis=1) + + data.extend(mode[mode != 0]) + indices.extend(np.where(mode != 0)[0]) + indptr.append(len(indices)) + + y_pred = sp.csc_matrix((data, indices, indptr), + (n_samples, n_outputs), 
+ dtype=classes_[0].dtype) return y_pred @@ -182,6 +211,10 @@ def predict_proba(self, X): classes_ = self.classes_ _y = self._y + + if self.sparse_target_input_: + _y = _y.toarray() + if not self.outputs_2d_: _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 4dcb45c488ea1..0c635005fa875 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -209,7 +209,6 @@ def test_kneighbors_classifier_predict_proba(): assert_array_almost_equal(real_prob, y_prob) - def test_radius_neighbors_classifier(n_samples=40, n_features=5, n_test_pts=10, @@ -849,6 +848,58 @@ def test_callable_metric(): assert_array_almost_equal(dist1, dist2) +def test_kneighbors_classifier_sparse_target_multioutput(): + """Test k-NN classifier on multioutput data with sparse target data""" + rng = check_random_state(0) + n_features = 5 + n_samples = 50 + n_output = 4 + + X = rng.rand(n_samples, n_features) + + # Construct target data so that we cover two cases of label encoding + # case 1: classes are not a 0 to n sequence + y_fst = rng.randint(1, 4, (n_samples, n_output//2)).astype(float) + # case 2: classes line up with their integer encoding + y_snd = rng.randint(0, 3, (n_samples, n_output//2)).astype(float) + y = np.hstack((y_fst, y_snd)) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + y_train = csc_matrix(y_train) + + weights = [None, 'uniform', 'distance', _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + # Stack single output prediction + y_pred_so = [] + y_pred_proba_so = [] + for o in range(n_output): + knn = neighbors.KNeighborsClassifier(weights=weights, + algorithm=algorithm) + knn.fit(X_train, y_train.getcol(o).toarray().ravel()) + y_pred_so.append(knn.predict(X_test)) + y_pred_proba_so.append(knn.predict_proba(X_test)) + + y_pred_so = np.vstack(y_pred_so).T 
assert_equal(y_pred_so.shape, y_test.shape) + assert_equal(len(y_pred_proba_so), n_output) + + # Multioutput prediction + knn_mo = neighbors.KNeighborsClassifier(weights=weights, + algorithm=algorithm) + knn_mo.fit(X_train, y_train) + y_pred_mo = knn_mo.predict(X_test) + + assert_equal(y_pred_mo.dtype, float) + assert_array_equal(y_pred_mo.toarray(), y_pred_so) + + # Check proba + y_pred_proba_mo = knn_mo.predict_proba(X_test) + assert_equal(len(y_pred_proba_mo), n_output) + + for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so): + assert_array_almost_equal(proba_mo, proba_so) + if __name__ == '__main__': import nose nose.runmodule()