diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 51af0e33139dd..9cc66bedb46ce 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -21,6 +21,7 @@
 from ..preprocessing import LabelBinarizer
 from ..utils import gen_batches, check_random_state
 from ..utils import shuffle
+from ..utils import _safe_indexing
 from ..utils import check_array, check_X_y, column_or_1d
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
@@ -503,6 +504,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
             y_val = None
 
         n_samples = X.shape[0]
+        sample_idx = np.arange(n_samples, dtype=int)
 
         if self.batch_size == 'auto':
             batch_size = min(200, n_samples)
@@ -512,12 +514,24 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
         try:
             for it in range(self.max_iter):
                 if self.shuffle:
-                    X, y = shuffle(X, y, random_state=self._random_state)
+                    # Shuffle only the sample indices rather than X and y to
+                    # reduce the memory footprint. These indices are then used
+                    # to select each mini-batch from X and y.
+                    sample_idx = shuffle(sample_idx,
+                                         random_state=self._random_state)
+
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    activations[0] = X[batch_slice]
+                    if self.shuffle:
+                        X_batch = _safe_indexing(X, sample_idx[batch_slice])
+                        y_batch = y[sample_idx[batch_slice]]
+                    else:
+                        X_batch = X[batch_slice]
+                        y_batch = y[batch_slice]
+
+                    activations[0] = X_batch
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        X[batch_slice], y[batch_slice], activations, deltas,
+                        X_batch, y_batch, activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
                                                       batch_slice.start)
@@ -664,7 +678,7 @@ def _predict(self, X):
         y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)
             The decision function of the samples for each class in the model.
         """
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
+        X = check_array(X, accept_sparse=['csr', 'csc'])
 
         # Make sure self.hidden_layer_sizes is a list
         hidden_layer_sizes = self.hidden_layer_sizes
@@ -928,7 +942,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             n_iter_no_change=n_iter_no_change, max_fun=max_fun)
 
     def _validate_input(self, X, y, incremental):
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
                          multi_output=True)
         if y.ndim == 2 and y.shape[1] == 1:
             y = column_or_1d(y, warn=True)
@@ -1336,7 +1350,7 @@ def predict(self, X):
         return y_pred
 
     def _validate_input(self, X, y, incremental):
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
                          multi_output=True, y_numeric=True)
         if y.ndim == 2 and y.shape[1] == 1:
             y = column_or_1d(y, warn=True)