FIX make shuffle / resample pass-through indexing utilities

ogrisel · ogrisel · commit 3001e6dd8203 · 2015-04-10T16:15:00.000-04:00
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
@@ -169,7 +169,9 @@ def resample(*arrays, **options):
 
     Parameters
     ----------
-    *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
+    *arrays : sequence of indexable data-structures
+        Indexable data-structures can be arrays, lists, dataframes or scipy
+        sparse matrices with consistent first dimension.
 
     replace : boolean, True by default
         Implements resampling with replacement. If False, this will implement
@@ -184,16 +186,15 @@ def resample(*arrays, **options):
 
     Returns
     -------
-    resampled_arrays : sequence of arrays or scipy.sparse matrices with same \
-    shape[0]
-        Sequence of resampled views of the collections. The original arrays are 
+    resampled_arrays : sequence of indexable data-structures
+        Sequence of resampled views of the collections. The original arrays are
         not impacted.
 
     Examples
     --------
     It is possible to mix sparse and dense arrays in the same run::
 
-      >>> X = [[1., 0.], [2., 1.], [0., 0.]]
+      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
       >>> y = np.array([0, 1, 2])
 
       >>> from scipy.sparse import coo_matrix
@@ -247,8 +248,6 @@ def resample(*arrays, **options):
             max_n_samples, n_samples))
 
     check_consistent_length(*arrays)
-    arrays = [check_array(x, accept_sparse='csr', ensure_2d=False,
-                          allow_nd=True) for x in arrays]
 
     if replace:
         indices = random_state.randint(0, n_samples, size=(max_n_samples,))
@@ -257,12 +256,9 @@ def resample(*arrays, **options):
         random_state.shuffle(indices)
         indices = indices[:max_n_samples]
 
-    resampled_arrays = []
-
-    for array in arrays:
-        array = array[indices]
-        resampled_arrays.append(array)
-
+    # convert sparse matrices to CSR for row-based indexing
+    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
+    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
     if len(resampled_arrays) == 1:
         # syntactic sugar for the unit argument case
         return resampled_arrays[0]
@@ -278,7 +274,9 @@ def shuffle(*arrays, **options):
 
     Parameters
     ----------
-    *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
+    *arrays : sequence of indexable data-structures
+        Indexable data-structures can be arrays, lists, dataframes or scipy
+        sparse matrices with consistent first dimension.
 
     random_state : int or RandomState instance
         Control the shuffling for reproducible behavior.
@@ -289,16 +287,15 @@ def shuffle(*arrays, **options):
 
     Returns
     -------
-    shuffled_arrays : sequence of arrays or scipy.sparse matrices with same \
-    shape[0]
+    shuffled_arrays : sequence of indexable data-structures
         Sequence of shuffled views of the collections. The original arrays are
         not impacted.
 
     Examples
     --------
     It is possible to mix sparse and dense arrays in the same run::
 
-      >>> X = [[1., 0.], [2., 1.], [0., 0.]]
+      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
       >>> y = np.array([0, 1, 2])
 
       >>> from scipy.sparse import coo_matrix
diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py
@@ -186,3 +186,36 @@ def to_tuple(A):    # to make the inner arrays hashable
     S = set(to_tuple(A))
     shuffle(A)  # shouldn't raise a ValueError for dim = 3
     assert_equal(set(to_tuple(A)), S)
+
+
+def test_shuffle_dont_convert_to_array():
+    # Check that shuffle does not try to convert its inputs to numpy arrays
+    # with float dtypes and lets any indexable data-structure pass through.
+    a = ['a', 'b', 'c']
+    b = np.array(['a', 'b', 'c'], dtype=object)
+    c = [1, 2, 3]
+    d = MockDataFrame(np.array([['a', 0],
+                                ['b', 1],
+                                ['c', 2]],
+                      dtype=object))
+    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
+    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
+
+    assert_equal(a_s, ['c', 'b', 'a'])
+    assert_equal(type(a_s), list)
+
+    assert_array_equal(b_s, ['c', 'b', 'a'])
+    assert_equal(b_s.dtype, object)
+
+    assert_equal(c_s, [3, 2, 1])
+    assert_equal(type(c_s), list)
+
+    assert_array_equal(d_s, np.array([['c', 2],
+                                      ['b', 1],
+                                      ['a', 0]],
+                                     dtype=object))
+    assert_equal(type(d_s), MockDataFrame)
+
+    assert_array_equal(e_s.toarray(), np.array([[4, 5],
+                                                [2, 3],
+                                                [0, 1]]))