From fa2adcf61576ed0a86c829f449f951a75db3f086 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Wed, 12 Jun 2019 16:47:04 +0200
Subject: [PATCH 01/13] Reduce memory footprint when using stochastic
 optimizers with shuffle

---
 sklearn/neural_network/multilayer_perceptron.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 8a5469df54897..aabfcbaa95764 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -498,6 +498,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
             y_val = None
 
         n_samples = X.shape[0]
+        idx = np.arange(n_samples, dtype=int)
 
         if self.batch_size == 'auto':
             batch_size = min(200, n_samples)
@@ -507,12 +508,12 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
         try:
             for it in range(self.max_iter):
                 if self.shuffle:
-                    X, y = shuffle(X, y, random_state=self._random_state)
+                    idx = shuffle(idx, random_state=self._random_state)
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    activations[0] = X[batch_slice]
+                    activations[0] = X[idx[batch_slice]]
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        X[batch_slice], y[batch_slice], activations, deltas,
+                        X[idx[batch_slice]], y[idx[batch_slice]], activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
                                                       batch_slice.start)

From a575c726981bc5f783eead8f6470e7dc8a7e8a2f Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Wed, 12 Jun 2019 17:02:58 +0200
Subject: [PATCH 02/13] Reduce the line length for flake8

---
 sklearn/neural_network/multilayer_perceptron.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index aabfcbaa95764..0d3cb2e4f4db5 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -513,7 +513,8 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                 for batch_slice in gen_batches(n_samples, batch_size):
                     activations[0] = X[idx[batch_slice]]
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        X[idx[batch_slice]], y[idx[batch_slice]], activations, deltas,
+                        X[idx[batch_slice]], y[idx[batch_slice]],
+                        activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
                                                       batch_slice.start)
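Note (PATCH 01): this commit carries the core idea of the series. `shuffle(X, y)` materializes a full copy of both arrays on every epoch, whereas shuffling only an integer index array costs 8 bytes per sample and defers the copying to one minibatch at a time. A minimal standalone sketch of the technique (the helper name `iter_shuffled_batches` is ours, not sklearn's; the real code uses `gen_batches` and feeds `_backprop` as shown in the diff):

    import numpy as np

    def iter_shuffled_batches(X, y, batch_size, rng):
        """Yield minibatches in shuffled order without copying X or y."""
        n_samples = X.shape[0]
        idx = np.arange(n_samples, dtype=int)
        rng.shuffle(idx)  # permutes only the index array, not the data
        for start in range(0, n_samples, batch_size):
            batch_idx = idx[start:start + batch_size]
            # fancy indexing copies just this one batch
            yield X[batch_idx], y[batch_idx]

    rng = np.random.default_rng(0)
    X = np.random.rand(1000, 20)
    y = np.random.randint(0, 2, size=1000)
    for X_batch, y_batch in iter_shuffled_batches(X, y, 200, rng):
        pass  # one gradient step per batch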
From e66cfa66325d89900b8457bb84383fe1f358c426 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Mon, 17 Jun 2019 15:09:59 +0200
Subject: [PATCH 03/13] Apply patch for fixed unit tests. Drop
 scipy.sparse.coo_matrix support

---
 sklearn/neural_network/multilayer_perceptron.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 0d3cb2e4f4db5..349117ae46029 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -21,6 +21,7 @@
 from ..utils import gen_batches, check_random_state
 from ..utils import shuffle
 from ..utils import check_array, check_X_y, column_or_1d
+from ..utils import safe_indexing
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
 from ..utils.validation import check_is_fitted
@@ -511,9 +512,10 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                     idx = shuffle(idx, random_state=self._random_state)
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    activations[0] = X[idx[batch_slice]]
+                    activations[0] = safe_indexing(X, idx[batch_slice])
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        X[idx[batch_slice]], y[idx[batch_slice]],
+                        safe_indexing(X, idx[batch_slice]),
+                        y[idx[batch_slice]],
                         activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
@@ -661,7 +663,7 @@ def _predict(self, X):
         y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
             The decision function of the samples for each class in the model.
         """
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
+        X = check_array(X, accept_sparse=['csr', 'csc'])
 
         # Make sure self.hidden_layer_sizes is a list
         hidden_layer_sizes = self.hidden_layer_sizes
@@ -917,7 +919,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                          n_iter_no_change=n_iter_no_change)
 
     def _validate_input(self, X, y, incremental):
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
                          multi_output=True)
         if y.ndim == 2 and y.shape[1] == 1:
             y = column_or_1d(y, warn=True)
@@ -1317,7 +1319,7 @@ def predict(self, X):
         return y_pred
 
     def _validate_input(self, X, y, incremental):
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
                          multi_output=True, y_numeric=True)
         if y.ndim == 2 and y.shape[1] == 1:
             y = column_or_1d(y, warn=True)
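Note (PATCH 03): routing the batch lookup through `safe_indexing` keeps non-ndarray containers working, and `'coo'` is dropped from `accept_sparse` because the COO layout supports no row indexing, which index-based batching now requires; inputs are converted to CSR/CSC instead. A small illustration of that limitation (our own example, reflecting SciPy behavior, not taken from the patch):

    import numpy as np
    from scipy import sparse

    row_idx = np.array([2, 0])
    X_csr = sparse.csr_matrix(np.eye(4))
    X_coo = sparse.coo_matrix(np.eye(4))

    print(X_csr[row_idx].toarray())  # CSR implements row fancy indexing
    try:
        X_coo[row_idx]               # COO does not implement __getitem__
    except TypeError as exc:
        print("coo_matrix:", exc)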
From 85bbab5bb246b52fe90ff6a00df707fe87a7f76d Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Thu, 20 Jun 2019 21:55:19 +0200
Subject: [PATCH 04/13] Do not use safe_indexing due to low speed. Handle
 Pandas DataFrame inside of check_X_y instead.

---
 sklearn/neural_network/multilayer_perceptron.py | 14 +++++++++++---
 sklearn/utils/validation.py                     |  4 ++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 349117ae46029..8269b84cfebe6 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -512,10 +512,18 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                     idx = shuffle(idx, random_state=self._random_state)
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    activations[0] = safe_indexing(X, idx[batch_slice])
+                    # only use integer indexing when it is needed, otherwise use fast-path
+                    if self.shuffle:
+                        X_batch = X[idx[batch_slice]]
+                        y_batch = y[idx[batch_size]]
+                    else:
+                        X_batch = X[batch_slice]
+                        y_batch = y[batch_slice]
+
+                    activations[0] = X_batch
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        safe_indexing(X, idx[batch_slice]),
-                        y[idx[batch_slice]],
+                        X_batch,
+                        y_batch,
                         activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 32cad0197317b..84c765d82f77a 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -449,6 +449,10 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         dtypes_orig = np.array(array.dtypes)
 
+    # extract the underlying numpy ndarray from a Pandas DataFrame
+    if hasattr(array, "iloc"):
+        array = array.values
+
     if dtype_numeric:
         if dtype_orig is not None and dtype_orig.kind == "O":
             # if input is object, convert to float.
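Note (PATCH 04): the fast path exists because plain slicing of an ndarray returns a view (no copy), while integer-array indexing always copies, so the extra cost is paid only when `shuffle=True`. Also note the typo `y[idx[batch_size]]` in this commit, which should read `y[idx[batch_slice]]`; it is what breaks the unit tests and is only corrected in PATCH 07. A sketch of the view-versus-copy distinction (our own example, not from the patch):

    import numpy as np

    X = np.random.rand(10_000, 50)
    batch = slice(0, 200)
    idx = np.arange(10_000)

    view = X[batch]       # basic slicing: a view into X, no data copied
    copy = X[idx[batch]]  # fancy indexing: a fresh 200-row copy
    assert view.base is X
    assert copy.base is not X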
From 0400df21aa695afe35592c0dc5635d3d73e08d96 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Thu, 20 Jun 2019 21:58:49 +0200
Subject: [PATCH 05/13] Shorter comment for flake8

---
 sklearn/neural_network/multilayer_perceptron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 8269b84cfebe6..392048f3451a1 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -512,7 +512,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                     idx = shuffle(idx, random_state=self._random_state)
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    # only use integer indexing when it is needed, otherwise use fast-path
+                    # only use integer indexing when it is needed
                     if self.shuffle:
                         X_batch = X[idx[batch_slice]]
                         y_batch = y[idx[batch_size]]

From d53194e0c5e06efd83e6f406988cde30d59d38ee Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Thu, 20 Jun 2019 22:03:10 +0200
Subject: [PATCH 06/13] Do not import unused function safe_indexing

---
 sklearn/neural_network/multilayer_perceptron.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 392048f3451a1..69b27ebdb4570 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -21,7 +21,6 @@
 from ..utils import gen_batches, check_random_state
 from ..utils import shuffle
 from ..utils import check_array, check_X_y, column_or_1d
-from ..utils import safe_indexing
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
 from ..utils.validation import check_is_fitted
From df615a712dff464557e589cd7b4ef5fe7de1b337 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Sat, 22 Jun 2019 12:55:42 +0200
Subject: [PATCH 07/13] Use safe_indexing without take (should fix tests)

---
 sklearn/neural_network/multilayer_perceptron.py | 9 ++++-----
 sklearn/utils/__init__.py                       | 8 ++------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 69b27ebdb4570..17c718c47c1a9 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -20,6 +20,7 @@
 from ..preprocessing import LabelBinarizer
 from ..utils import gen_batches, check_random_state
 from ..utils import shuffle
+from ..utils import safe_indexing
 from ..utils import check_array, check_X_y, column_or_1d
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
@@ -513,17 +514,15 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                 for batch_slice in gen_batches(n_samples, batch_size):
                     # only use integer indexing when it is needed
                     if self.shuffle:
-                        X_batch = X[idx[batch_slice]]
-                        y_batch = y[idx[batch_size]]
+                        X_batch = safe_indexing(X, idx[batch_slice])
+                        y_batch = y[idx[batch_slice]]
                     else:
                         X_batch = X[batch_slice]
                         y_batch = y[batch_slice]
 
                     activations[0] = X_batch
                     batch_loss, coef_grads, intercept_grads = self._backprop(
-                        X_batch,
-                        y_batch,
-                        activations, deltas,
+                        X_batch, y_batch, activations, deltas,
                         coef_grads, intercept_grads)
                     accumulated_loss += batch_loss * (batch_slice.stop -
                                                       batch_slice.start)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index daf7e7763235d..d5671181ddb99 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -213,12 +213,8 @@ def safe_indexing(X, indices):
                           DataConversionWarning)
         return X.copy().iloc[indices]
     elif hasattr(X, "shape"):
-        if hasattr(X, 'take') and (hasattr(indices, 'dtype') and
-                                   indices.dtype.kind == 'i'):
-            # This is often substantially faster than X[indices]
-            return X.take(indices, axis=0)
-        else:
-            return X[indices]
+        # just use fancy indexing, which is faster than numpy take (see PR 14075)
+        return X[indices]
     else:
         return [X[idx] for idx in indices]
 

From 2fa153571b3266bbd9d60dfb6d33bf195b470c05 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Sat, 22 Jun 2019 13:38:18 +0200
Subject: [PATCH 08/13] Use slightly shorter comment for flake8

---
 sklearn/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index d5671181ddb99..456582c61e105 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -213,7 +213,7 @@ def safe_indexing(X, indices):
                           DataConversionWarning)
         return X.copy().iloc[indices]
     elif hasattr(X, "shape"):
-        # just use fancy indexing, which is faster than numpy take (see PR 14075)
+        # just use fancy indexing, which is faster than numpy take (PR 14075)
         return X[indices]
     else:
         return [X[idx] for idx in indices]

From 62f662d517f8e29673bdf595852f093eded90091 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Mon, 24 Jun 2019 11:21:02 +0200
Subject: [PATCH 09/13] Remove not-required pandas Dataframe handling

---
 sklearn/utils/validation.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 84c765d82f77a..32cad0197317b 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -449,10 +449,6 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         dtypes_orig = np.array(array.dtypes)
 
-    # extract the underlying numpy ndarray from a Pandas DataFrame
-    if hasattr(array, "iloc"):
-        array = array.values
-
     if dtype_numeric:
         if dtype_orig is not None and dtype_orig.kind == "O":
             # if input is object, convert to float.
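Note (PATCHES 07-09): this settles the design. `safe_indexing` returns for the X batches so sparse matrices and DataFrames keep working, the `y[idx[batch_size]]` typo is fixed, the `ndarray.take` branch inside `safe_indexing` is dropped in favor of plain fancy indexing (benchmarks on PR 14075 found `take` no faster), and PATCH 09 reverts the now-unneeded `check_array` change from PATCH 04. A rough micro-benchmark sketch of the take-versus-fancy-indexing question (our own example; results vary with NumPy version, array layout, and batch size):

    import numpy as np
    from timeit import timeit

    X = np.random.rand(100_000, 100)
    indices = np.random.permutation(100_000)[:200]

    t_take = timeit(lambda: X.take(indices, axis=0), number=2000)
    t_fancy = timeit(lambda: X[indices], number=2000)
    print(f"take: {t_take:.3f}s  fancy indexing: {t_fancy:.3f}s")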
From dd2e79693de45f3e01cabde96b83b0dab417689f Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Tue, 17 Dec 2019 13:44:04 +0100
Subject: [PATCH 10/13] Update import to avoid FutureWarning

---
 sklearn/neural_network/_multilayer_perceptron.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index df7583ec9ff5d..9684cb0b5018f 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -21,7 +21,7 @@
 from ..preprocessing import LabelBinarizer
 from ..utils import gen_batches, check_random_state
 from ..utils import shuffle
-from ..utils import safe_indexing
+from ..utils import _safe_indexing
 from ..utils import check_array, check_X_y, column_or_1d
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
@@ -519,7 +519,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                 for batch_slice in gen_batches(n_samples, batch_size):
                     # only use integer indexing when it is needed
                     if self.shuffle:
-                        X_batch = safe_indexing(X, idx[batch_slice])
+                        X_batch = _safe_indexing(X, idx[batch_slice])
                         y_batch = y[idx[batch_slice]]
                     else:
                         X_batch = X[batch_slice]
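Note (PATCH 10): scikit-learn 0.22 deprecated the public `sklearn.utils.safe_indexing` in favor of the private `_safe_indexing`, so the in-tree code switches to the private name to silence the FutureWarning. Code outside sklearn that needs the helper across versions often uses a fallback import along these lines (a hedged sketch, not part of the patch):

    try:
        # scikit-learn >= 0.22 ships the private helper
        from sklearn.utils import _safe_indexing
    except ImportError:
        # older releases expose only the public name
        from sklearn.utils import safe_indexing as _safe_indexing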
From 516958fafab1f5d925f90e9f1200e9fdd50a535b Mon Sep 17 00:00:00 2001
From: meyer89
Date: Mon, 13 Jan 2020 15:20:56 +0100
Subject: [PATCH 11/13] Apply suggestions from code review

Co-Authored-By: Guillaume Lemaitre
---
 sklearn/neural_network/_multilayer_perceptron.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 9684cb0b5018f..a3fdfbb921f11 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -504,7 +504,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
             y_val = None
 
         n_samples = X.shape[0]
-        idx = np.arange(n_samples, dtype=int)
+        sample_idx = np.arange(n_samples, dtype=int)
 
         if self.batch_size == 'auto':
             batch_size = min(200, n_samples)
@@ -514,13 +514,15 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
         try:
             for it in range(self.max_iter):
                 if self.shuffle:
-                    idx = shuffle(idx, random_state=self._random_state)
+                    # Only shuffle the sample indices instead of X and y to
+                    # reduce the memory footprint. These indices will be used
+                    # to slice the X and y.
+                    sample_idx = shuffle(sample_idx, random_state=self._random_state)
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
-                    # only use integer indexing when it is needed
                     if self.shuffle:
-                        X_batch = _safe_indexing(X, idx[batch_slice])
-                        y_batch = y[idx[batch_slice]]
+                        X_batch = _safe_indexing(X, sample_idx[batch_slice])
+                        y_batch = y[sample_idx[batch_slice]]
                     else:
                         X_batch = X[batch_slice]
                         y_batch = y[batch_slice]

From 05f1f551a50bb0c9da0399654919097173b82386 Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Mon, 13 Jan 2020 15:25:44 +0100
Subject: [PATCH 12/13] Reduce line length for flake8

---
 sklearn/neural_network/_multilayer_perceptron.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index e8805db8ccbf1..bf2f70de3df50 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -517,7 +517,9 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                     # Only shuffle the sample indices instead of X and y to
                     # reduce the memory footprint. These indices will be used
                     # to slice the X and y.
-                    sample_idx = shuffle(sample_idx, random_state=self._random_state)
+                    sample_idx = shuffle(sample_idx,
+                                         random_state=self._random_state)
+
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
                     if self.shuffle:

From 3872d4e7256cd7ed186a543fcc90e419de0a88ff Mon Sep 17 00:00:00 2001
From: Marian Meyer
Date: Mon, 13 Jan 2020 15:30:45 +0100
Subject: [PATCH 13/13] Remove whitespace in empty line (flake8)

---
 sklearn/neural_network/_multilayer_perceptron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index bf2f70de3df50..9cc66bedb46ce 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -519,7 +519,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                     # to slice the X and y.
                     sample_idx = shuffle(sample_idx,
                                          random_state=self._random_state)
-                    
+
                 accumulated_loss = 0.0
                 for batch_slice in gen_batches(n_samples, batch_size):
                     if self.shuffle:
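Net effect of the series: per epoch, the old loop copied all of X and y through `shuffle(X, y)`, while the new loop shuffles one int index array and materializes a single batch at a time. Back-of-envelope numbers for an illustrative dataset (shapes are our own example, assuming float64 data and int64 indices on a 64-bit platform):

    import numpy as np

    n_samples, n_features, batch_size = 100_000, 100, 200
    X = np.zeros((n_samples, n_features))        # ~80 MB of float64
    y = np.zeros(n_samples)                      # ~0.8 MB

    old_per_epoch = X.nbytes + y.nbytes          # full copies: ~80.8 MB
    new_per_epoch = np.arange(n_samples).nbytes  # index array: ~0.8 MB
    per_batch = X[:batch_size].copy().nbytes     # one materialized batch: ~0.16 MB

    print(f"old: {old_per_epoch / 1e6:.1f} MB/epoch; "
          f"new: {new_per_epoch / 1e6:.1f} MB/epoch + "
          f"{per_batch / 1e6:.2f} MB per batch")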