diff --git a/conftest.py b/conftest.py index 73326d6d2e32b..0a7c4a1d8137b 100644 --- a/conftest.py +++ b/conftest.py @@ -88,3 +88,13 @@ def pytest_runtest_setup(item): def pytest_runtest_teardown(item, nextitem): if isinstance(item, DoctestItem): set_config(print_changed_only=False) + + +# We don't want pytest to run these files since they immediately raise a +# DeprecationWarning and that would make CI unhappy. +# These files don't contain any real code, they just raise a warning. We still +# ensure these warnings are properly raised in the tests. +collect_ignore_glob = [ + "sklearn/neural_network/rbm.py", # 0.24 + "sklearn/neural_network/multilayer_perceptron.py", # 0.24 +] diff --git a/sklearn/neural_network/__init__.py b/sklearn/neural_network/__init__.py index 470c06556aae6..722b1453e08ec 100644 --- a/sklearn/neural_network/__init__.py +++ b/sklearn/neural_network/__init__.py @@ -5,10 +5,10 @@ # License: BSD 3 clause -from .rbm import BernoulliRBM +from ._rbm import BernoulliRBM -from .multilayer_perceptron import MLPClassifier -from .multilayer_perceptron import MLPRegressor +from ._multilayer_perceptron import MLPClassifier +from ._multilayer_perceptron import MLPRegressor __all__ = ["BernoulliRBM", "MLPClassifier", diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py new file mode 100644 index 0000000000000..b6367d32e57a9 --- /dev/null +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -0,0 +1,1343 @@ +"""Multi-layer Perceptron +""" + +# Authors: Issam H. Laradji +# Andreas Mueller +# Jiyuan Qian +# License: BSD 3 clause + +import numpy as np + +from abc import ABCMeta, abstractmethod +import warnings + +import scipy.optimize + +from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..base import is_classifier +from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS +from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer +from ..model_selection import train_test_split +from ..preprocessing import LabelBinarizer +from ..utils import gen_batches, check_random_state +from ..utils import shuffle +from ..utils import check_array, check_X_y, column_or_1d +from ..exceptions import ConvergenceWarning +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_is_fitted +from ..utils.multiclass import _check_partial_fit_first_call, unique_labels +from ..utils.multiclass import type_of_target +from ..utils.optimize import _check_optimize_result + + +_STOCHASTIC_SOLVERS = ['sgd', 'adam'] + + +def _pack(coefs_, intercepts_): + """Pack the parameters into a single vector.""" + return np.hstack([l.ravel() for l in coefs_ + intercepts_]) + + +class BaseMultilayerPerceptron(BaseEstimator, metaclass=ABCMeta): + """Base class for MLP classification and regression. + + Warning: This class should not be used directly. + Use derived classes instead. + + .. 
versionadded:: 0.18 + """ + + @abstractmethod + def __init__(self, hidden_layer_sizes, activation, solver, + alpha, batch_size, learning_rate, learning_rate_init, power_t, + max_iter, loss, shuffle, random_state, tol, verbose, + warm_start, momentum, nesterovs_momentum, early_stopping, + validation_fraction, beta_1, beta_2, epsilon, + n_iter_no_change, max_fun): + self.activation = activation + self.solver = solver + self.alpha = alpha + self.batch_size = batch_size + self.learning_rate = learning_rate + self.learning_rate_init = learning_rate_init + self.power_t = power_t + self.max_iter = max_iter + self.loss = loss + self.hidden_layer_sizes = hidden_layer_sizes + self.shuffle = shuffle + self.random_state = random_state + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + self.momentum = momentum + self.nesterovs_momentum = nesterovs_momentum + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.n_iter_no_change = n_iter_no_change + self.max_fun = max_fun + + def _unpack(self, packed_parameters): + """Extract the coefficients and intercepts from packed_parameters.""" + for i in range(self.n_layers_ - 1): + start, end, shape = self._coef_indptr[i] + self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) + + start, end = self._intercept_indptr[i] + self.intercepts_[i] = packed_parameters[start:end] + + def _forward_pass(self, activations): + """Perform a forward pass on the network by computing the values + of the neurons in the hidden layers and the output layer. + + Parameters + ---------- + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + """ + hidden_activation = ACTIVATIONS[self.activation] + # Iterate over the hidden layers + for i in range(self.n_layers_ - 1): + activations[i + 1] = safe_sparse_dot(activations[i], + self.coefs_[i]) + activations[i + 1] += self.intercepts_[i] + + # For the hidden layers + if (i + 1) != (self.n_layers_ - 1): + activations[i + 1] = hidden_activation(activations[i + 1]) + + # For the last layer + output_activation = ACTIVATIONS[self.out_activation_] + activations[i + 1] = output_activation(activations[i + 1]) + + return activations + + def _compute_loss_grad(self, layer, n_samples, activations, deltas, + coef_grads, intercept_grads): + """Compute the gradient of loss with respect to coefs and intercept for + specified layer. + + This function does backpropagation for the specified one layer. + """ + coef_grads[layer] = safe_sparse_dot(activations[layer].T, + deltas[layer]) + coef_grads[layer] += (self.alpha * self.coefs_[layer]) + coef_grads[layer] /= n_samples + + intercept_grads[layer] = np.mean(deltas[layer], 0) + + return coef_grads, intercept_grads + + def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, + coef_grads, intercept_grads): + """Compute the MLP loss function and its corresponding derivatives + with respect to the different parameters given in the initialization. + + Returned gradients are packed in a single vector so it can be used + in lbfgs + + Parameters + ---------- + packed_coef_inter : array-like + A vector comprising the flattened coefficients and intercepts. + + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) + The target values. 
+ + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. + + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. + + Returns + ------- + loss : float + grad : array-like, shape (number of nodes of all layers,) + """ + self._unpack(packed_coef_inter) + loss, coef_grads, intercept_grads = self._backprop( + X, y, activations, deltas, coef_grads, intercept_grads) + grad = _pack(coef_grads, intercept_grads) + return loss, grad + + def _backprop(self, X, y, activations, deltas, coef_grads, + intercept_grads): + """Compute the MLP loss function and its corresponding derivatives + with respect to each parameter: weights and bias vectors. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) + The target values. + + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. + + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. 
+ + Returns + ------- + loss : float + coef_grads : list, length = n_layers - 1 + intercept_grads : list, length = n_layers - 1 + """ + n_samples = X.shape[0] + + # Forward propagate + activations = self._forward_pass(activations) + + # Get loss + loss_func_name = self.loss + if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic': + loss_func_name = 'binary_log_loss' + loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1]) + # Add L2 regularization term to loss + values = np.sum( + np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_])) + loss += (0.5 * self.alpha) * values / n_samples + + # Backward propagate + last = self.n_layers_ - 2 + + # The calculation of delta[last] here works with following + # combinations of output activation and loss function: + # sigmoid and binary cross entropy, softmax and categorical cross + # entropy, and identity with squared loss + deltas[last] = activations[-1] - y + + # Compute gradient for the last layer + coef_grads, intercept_grads = self._compute_loss_grad( + last, n_samples, activations, deltas, coef_grads, intercept_grads) + + # Iterate over the hidden layers + for i in range(self.n_layers_ - 2, 0, -1): + deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) + inplace_derivative = DERIVATIVES[self.activation] + inplace_derivative(activations[i], deltas[i - 1]) + + coef_grads, intercept_grads = self._compute_loss_grad( + i - 1, n_samples, activations, deltas, coef_grads, + intercept_grads) + + return loss, coef_grads, intercept_grads + + def _initialize(self, y, layer_units): + # set all attributes, allocate weights etc for first call + # Initialize parameters + self.n_iter_ = 0 + self.t_ = 0 + self.n_outputs_ = y.shape[1] + + # Compute the number of layers + self.n_layers_ = len(layer_units) + + # Output for regression + if not is_classifier(self): + self.out_activation_ = 'identity' + # Output for multi class + elif self._label_binarizer.y_type_ == 'multiclass': + self.out_activation_ = 'softmax' + # Output for binary class and multi-label + else: + self.out_activation_ = 'logistic' + + # Initialize coefficient and intercept layers + self.coefs_ = [] + self.intercepts_ = [] + + for i in range(self.n_layers_ - 1): + coef_init, intercept_init = self._init_coef(layer_units[i], + layer_units[i + 1]) + self.coefs_.append(coef_init) + self.intercepts_.append(intercept_init) + + if self.solver in _STOCHASTIC_SOLVERS: + self.loss_curve_ = [] + self._no_improvement_count = 0 + if self.early_stopping: + self.validation_scores_ = [] + self.best_validation_score_ = -np.inf + else: + self.best_loss_ = np.inf + + def _init_coef(self, fan_in, fan_out): + # Use the initialization method recommended by + # Glorot et al. + factor = 6. + if self.activation == 'logistic': + factor = 2. + init_bound = np.sqrt(factor / (fan_in + fan_out)) + + # Generate weights and bias: + coef_init = self._random_state.uniform(-init_bound, init_bound, + (fan_in, fan_out)) + intercept_init = self._random_state.uniform(-init_bound, init_bound, + fan_out) + return coef_init, intercept_init + + def _fit(self, X, y, incremental=False): + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + # Validate input parameters. + self._validate_hyperparameters() + if np.any(np.array(hidden_layer_sizes) <= 0): + raise ValueError("hidden_layer_sizes must be > 0, got %s." 
% + hidden_layer_sizes) + + X, y = self._validate_input(X, y, incremental) + n_samples, n_features = X.shape + + # Ensure y is 2D + if y.ndim == 1: + y = y.reshape((-1, 1)) + + self.n_outputs_ = y.shape[1] + + layer_units = ([n_features] + hidden_layer_sizes + + [self.n_outputs_]) + + # check random state + self._random_state = check_random_state(self.random_state) + + if not hasattr(self, 'coefs_') or (not self.warm_start and not + incremental): + # First time training the model + self._initialize(y, layer_units) + + # lbfgs does not support mini-batches + if self.solver == 'lbfgs': + batch_size = n_samples + elif self.batch_size == 'auto': + batch_size = min(200, n_samples) + else: + if self.batch_size < 1 or self.batch_size > n_samples: + warnings.warn("Got `batch_size` less than 1 or larger than " + "sample size. It is going to be clipped") + batch_size = np.clip(self.batch_size, 1, n_samples) + + # Initialize lists + activations = [X] + [None] * (len(layer_units) - 1) + deltas = [None] * (len(activations) - 1) + + coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, + n_fan_out_ in zip(layer_units[:-1], + layer_units[1:])] + + intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in + layer_units[1:]] + + # Run the Stochastic optimization solver + if self.solver in _STOCHASTIC_SOLVERS: + self._fit_stochastic(X, y, activations, deltas, coef_grads, + intercept_grads, layer_units, incremental) + + # Run the LBFGS solver + elif self.solver == 'lbfgs': + self._fit_lbfgs(X, y, activations, deltas, coef_grads, + intercept_grads, layer_units) + return self + + def _validate_hyperparameters(self): + if not isinstance(self.shuffle, bool): + raise ValueError("shuffle must be either True or False, got %s." % + self.shuffle) + if self.max_iter <= 0: + raise ValueError("max_iter must be > 0, got %s." % self.max_iter) + if self.max_fun <= 0: + raise ValueError("max_fun must be > 0, got %s." % self.max_fun) + if self.alpha < 0.0: + raise ValueError("alpha must be >= 0, got %s." % self.alpha) + if (self.learning_rate in ["constant", "invscaling", "adaptive"] and + self.learning_rate_init <= 0.0): + raise ValueError("learning_rate_init must be > 0, got %s." % + self.learning_rate) + if self.momentum > 1 or self.momentum < 0: + raise ValueError("momentum must be >= 0 and <= 1, got %s" % + self.momentum) + if not isinstance(self.nesterovs_momentum, bool): + raise ValueError("nesterovs_momentum must be either True or False," + " got %s." % self.nesterovs_momentum) + if not isinstance(self.early_stopping, bool): + raise ValueError("early_stopping must be either True or False," + " got %s." % self.early_stopping) + if self.validation_fraction < 0 or self.validation_fraction >= 1: + raise ValueError("validation_fraction must be >= 0 and < 1, " + "got %s" % self.validation_fraction) + if self.beta_1 < 0 or self.beta_1 >= 1: + raise ValueError("beta_1 must be >= 0 and < 1, got %s" % + self.beta_1) + if self.beta_2 < 0 or self.beta_2 >= 1: + raise ValueError("beta_2 must be >= 0 and < 1, got %s" % + self.beta_2) + if self.epsilon <= 0.0: + raise ValueError("epsilon must be > 0, got %s." % self.epsilon) + if self.n_iter_no_change <= 0: + raise ValueError("n_iter_no_change must be > 0, got %s." + % self.n_iter_no_change) + + # raise ValueError if not registered + if self.activation not in ACTIVATIONS: + raise ValueError("The activation '%s' is not supported. Supported " + "activations are %s." 
+ % (self.activation, list(sorted(ACTIVATIONS)))) + if self.learning_rate not in ["constant", "invscaling", "adaptive"]: + raise ValueError("learning rate %s is not supported. " % + self.learning_rate) + supported_solvers = _STOCHASTIC_SOLVERS + ["lbfgs"] + if self.solver not in supported_solvers: + raise ValueError("The solver %s is not supported. " + " Expected one of: %s" % + (self.solver, ", ".join(supported_solvers))) + + def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, + intercept_grads, layer_units): + # Store meta information for the parameters + self._coef_indptr = [] + self._intercept_indptr = [] + start = 0 + + # Save sizes and indices of coefficients for faster unpacking + for i in range(self.n_layers_ - 1): + n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] + + end = start + (n_fan_in * n_fan_out) + self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) + start = end + + # Save sizes and indices of intercepts for faster unpacking + for i in range(self.n_layers_ - 1): + end = start + layer_units[i + 1] + self._intercept_indptr.append((start, end)) + start = end + + # Run LBFGS + packed_coef_inter = _pack(self.coefs_, + self.intercepts_) + + if self.verbose is True or self.verbose >= 1: + iprint = 1 + else: + iprint = -1 + + opt_res = scipy.optimize.minimize( + self._loss_grad_lbfgs, packed_coef_inter, + method="L-BFGS-B", jac=True, + options={ + "maxfun": self.max_fun, + "maxiter": self.max_iter, + "iprint": iprint, + "gtol": self.tol + }, + args=(X, y, activations, deltas, coef_grads, intercept_grads)) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) + self.loss_ = opt_res.fun + self._unpack(opt_res.x) + + def _fit_stochastic(self, X, y, activations, deltas, coef_grads, + intercept_grads, layer_units, incremental): + + if not incremental or not hasattr(self, '_optimizer'): + params = self.coefs_ + self.intercepts_ + + if self.solver == 'sgd': + self._optimizer = SGDOptimizer( + params, self.learning_rate_init, self.learning_rate, + self.momentum, self.nesterovs_momentum, self.power_t) + elif self.solver == 'adam': + self._optimizer = AdamOptimizer( + params, self.learning_rate_init, self.beta_1, self.beta_2, + self.epsilon) + + # early_stopping in partial_fit doesn't make sense + early_stopping = self.early_stopping and not incremental + if early_stopping: + # don't stratify in multilabel classification + should_stratify = is_classifier(self) and self.n_outputs_ == 1 + stratify = y if should_stratify else None + X, X_val, y, y_val = train_test_split( + X, y, random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify) + if is_classifier(self): + y_val = self._label_binarizer.inverse_transform(y_val) + else: + X_val = None + y_val = None + + n_samples = X.shape[0] + + if self.batch_size == 'auto': + batch_size = min(200, n_samples) + else: + batch_size = np.clip(self.batch_size, 1, n_samples) + + try: + for it in range(self.max_iter): + if self.shuffle: + X, y = shuffle(X, y, random_state=self._random_state) + accumulated_loss = 0.0 + for batch_slice in gen_batches(n_samples, batch_size): + activations[0] = X[batch_slice] + batch_loss, coef_grads, intercept_grads = self._backprop( + X[batch_slice], y[batch_slice], activations, deltas, + coef_grads, intercept_grads) + accumulated_loss += batch_loss * (batch_slice.stop - + batch_slice.start) + + # update weights + grads = coef_grads + intercept_grads + self._optimizer.update_params(grads) + + self.n_iter_ += 1 + self.loss_ = accumulated_loss / 
X.shape[0] + + self.t_ += n_samples + self.loss_curve_.append(self.loss_) + if self.verbose: + print("Iteration %d, loss = %.8f" % (self.n_iter_, + self.loss_)) + + # update no_improvement_count based on training loss or + # validation score according to early_stopping + self._update_no_improvement_count(early_stopping, X_val, y_val) + + # for learning rate that needs to be updated at iteration end + self._optimizer.iteration_ends(self.t_) + + if self._no_improvement_count > self.n_iter_no_change: + # not better than last `n_iter_no_change` iterations by tol + # stop or decrease learning rate + if early_stopping: + msg = ("Validation score did not improve more than " + "tol=%f for %d consecutive epochs." % ( + self.tol, self.n_iter_no_change)) + else: + msg = ("Training loss did not improve more than tol=%f" + " for %d consecutive epochs." % ( + self.tol, self.n_iter_no_change)) + + is_stopping = self._optimizer.trigger_stopping( + msg, self.verbose) + if is_stopping: + break + else: + self._no_improvement_count = 0 + + if incremental: + break + + if self.n_iter_ == self.max_iter: + warnings.warn( + "Stochastic Optimizer: Maximum iterations (%d) " + "reached and the optimization hasn't converged yet." + % self.max_iter, ConvergenceWarning) + except KeyboardInterrupt: + warnings.warn("Training interrupted by user.") + + if early_stopping: + # restore best weights + self.coefs_ = self._best_coefs + self.intercepts_ = self._best_intercepts + + def _update_no_improvement_count(self, early_stopping, X_val, y_val): + if early_stopping: + # compute validation score, use that for stopping + self.validation_scores_.append(self.score(X_val, y_val)) + + if self.verbose: + print("Validation score: %f" % self.validation_scores_[-1]) + # update best parameters + # use validation_scores_, not loss_curve_ + # let's hope no-one overloads .score with mse + last_valid_score = self.validation_scores_[-1] + + if last_valid_score < (self.best_validation_score_ + + self.tol): + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + + if last_valid_score > self.best_validation_score_: + self.best_validation_score_ = last_valid_score + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() + for i in self.intercepts_] + else: + if self.loss_curve_[-1] > self.best_loss_ - self.tol: + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + if self.loss_curve_[-1] < self.best_loss_: + self.best_loss_ = self.loss_curve_[-1] + + def fit(self, X, y): + """Fit the model to data matrix X and target(s) y. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + Returns + ------- + self : returns a trained MLP model. + """ + return self._fit(X, y, incremental=False) + + @property + def partial_fit(self): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) + The target values. + + Returns + ------- + self : returns a trained MLP model. + """ + if self.solver not in _STOCHASTIC_SOLVERS: + raise AttributeError("partial_fit is only available for stochastic" + " optimizers. %s is not stochastic." 
+ % self.solver) + return self._partial_fit + + def _partial_fit(self, X, y): + return self._fit(X, y, incremental=True) + + def _predict(self, X): + """Predict using the trained model + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + Returns + ------- + y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) + The decision function of the samples for each class in the model. + """ + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + layer_units = [X.shape[1]] + hidden_layer_sizes + \ + [self.n_outputs_] + + # Initialize layers + activations = [X] + + for i in range(self.n_layers_ - 1): + activations.append(np.empty((X.shape[0], + layer_units[i + 1]))) + # forward propagate + self._forward_pass(activations) + y_pred = activations[-1] + + return y_pred + + +class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron classifier. + + This model optimizes the log-loss function using LBFGS or stochastic + gradient descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed + by Kingma, Diederik, and Jimmy Ba + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, optional, default 0.0001 + L2 penalty (regularization term) parameter. + + batch_size : int, optional, default 'auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)` + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate at each + time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. 
+ + Only used when ``solver='sgd'``. + + learning_rate_init : double, optional, default 0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : double, optional, default 0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, optional, default 200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, optional, default True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + + random_state : int, RandomState instance or None, optional, default None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + tol : float, optional, default 1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + + verbose : bool, optional, default False + Whether to print progress messages to stdout. + + warm_start : bool, optional, default False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default 0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : boolean, default True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least tol for + ``n_iter_no_change`` consecutive epochs. The split is stratified, + except in a multilabel setting. + Only effective when solver='sgd' or 'adam' + + validation_fraction : float, optional, default 0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True + + beta_1 : float, optional, default 0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam' + + beta_2 : float, optional, default 0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam' + + epsilon : float, optional, default 1e-8 + Value for numerical stability in adam. Only used when solver='adam' + + n_iter_no_change : int, optional, default 10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam' + + .. versionadded:: 0.20 + + max_fun : int, optional, default 15000 + Only used when solver='lbfgs'. Maximum number of loss function calls. 
+ The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of loss function calls. + Note that number of loss function calls will be greater than or equal + to the number of iterations for the `MLPClassifier`. + + .. versionadded:: 0.22 + + Attributes + ---------- + classes_ : array or list of array of shape (n_classes,) + Class labels for each output. + + loss_ : float + The current loss computed with the loss function. + + coefs_ : list, length n_layers - 1 + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list, length n_layers - 1 + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_iter_ : int, + The number of iterations the solver has ran. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : string + Name of the output activation function. + + Notes + ----- + MLPClassifier trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. + "Connectionist learning procedures." Artificial intelligence 40.1 + (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of + training deep feedforward neural networks." International Conference + on Artificial Intelligence and Statistics. 2010. + + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." arXiv preprint + arXiv:1502.01852 (2015). + + Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic + optimization." arXiv preprint arXiv:1412.6980 (2014). 
+ """ + def __init__(self, hidden_layer_sizes=(100,), activation="relu", + solver='adam', alpha=0.0001, + batch_size='auto', learning_rate="constant", + learning_rate_init=0.001, power_t=0.5, max_iter=200, + shuffle=True, random_state=None, tol=1e-4, + verbose=False, warm_start=False, momentum=0.9, + nesterovs_momentum=True, early_stopping=False, + validation_fraction=0.1, beta_1=0.9, beta_2=0.999, + epsilon=1e-8, n_iter_no_change=10, max_fun=15000): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, solver=solver, alpha=alpha, + batch_size=batch_size, learning_rate=learning_rate, + learning_rate_init=learning_rate_init, power_t=power_t, + max_iter=max_iter, loss='log_loss', shuffle=shuffle, + random_state=random_state, tol=tol, verbose=verbose, + warm_start=warm_start, momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, + n_iter_no_change=n_iter_no_change, max_fun=max_fun) + + def _validate_input(self, X, y, incremental): + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + + if not incremental: + self._label_binarizer = LabelBinarizer() + self._label_binarizer.fit(y) + self.classes_ = self._label_binarizer.classes_ + elif self.warm_start: + classes = unique_labels(y) + if set(classes) != set(self.classes_): + raise ValueError("warm_start can only be used where `y` has " + "the same classes as in the previous " + "call to fit. Previously got %s, `y` has %s" % + (self.classes_, classes)) + else: + classes = unique_labels(y) + if len(np.setdiff1d(classes, self.classes_, assume_unique=True)): + raise ValueError("`y` has classes not in `self.classes_`." + " `self.classes_` has %s. 'y' has %s." % + (self.classes_, classes)) + + y = self._label_binarizer.transform(y) + return X, y + + def predict(self, X): + """Predict using the multi-layer perceptron classifier + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + Returns + ------- + y : array-like, shape (n_samples,) or (n_samples, n_classes) + The predicted classes. + """ + check_is_fitted(self) + y_pred = self._predict(X) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + return self._label_binarizer.inverse_transform(y_pred) + + def fit(self, X, y): + """Fit the model to data matrix X and target(s) y. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + Returns + ------- + self : returns a trained MLP model. + """ + return self._fit(X, y, incremental=(self.warm_start and + hasattr(self, "classes_"))) + + @property + def partial_fit(self): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + y : array-like, shape (n_samples,) + The target values. + + classes : array, shape (n_classes), default None + Classes across all calls to partial_fit. + Can be obtained via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. 
+ Note that y doesn't need to contain all labels in `classes`. + + Returns + ------- + self : returns a trained MLP model. + """ + if self.solver not in _STOCHASTIC_SOLVERS: + raise AttributeError("partial_fit is only available for stochastic" + " optimizer. %s is not stochastic" + % self.solver) + return self._partial_fit + + def _partial_fit(self, X, y, classes=None): + if _check_partial_fit_first_call(self, classes): + self._label_binarizer = LabelBinarizer() + if type_of_target(y).startswith('multilabel'): + self._label_binarizer.fit(y) + else: + self._label_binarizer.fit(classes) + + super()._partial_fit(X, y) + + return self + + def predict_log_proba(self, X): + """Return the log of probability estimates. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input data. + + Returns + ------- + log_y_prob : array-like, shape (n_samples, n_classes) + The predicted log-probability of the sample for each class + in the model, where classes are ordered as they are in + `self.classes_`. Equivalent to log(predict_proba(X)) + """ + y_prob = self.predict_proba(X) + return np.log(y_prob, out=y_prob) + + def predict_proba(self, X): + """Probability estimates. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + Returns + ------- + y_prob : array-like, shape (n_samples, n_classes) + The predicted probability of the sample for each class in the + model, where classes are ordered as they are in `self.classes_`. + """ + check_is_fitted(self) + y_pred = self._predict(X) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + if y_pred.ndim == 1: + return np.vstack([1 - y_pred, y_pred]).T + else: + return y_pred + + +class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron regressor. + + This model optimizes the squared-loss using LBFGS or stochastic gradient + descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed by + Kingma, Diederik, and Jimmy Ba + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, optional, default 0.0001 + L2 penalty (regularization term) parameter. + + batch_size : int, optional, default 'auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. 
+ When set to "auto", `batch_size=min(200, n_samples)` + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate ``learning_rate_`` + at each time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + + Only used when solver='sgd'. + + learning_rate_init : double, optional, default 0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : double, optional, default 0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, optional, default 200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, optional, default True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + + random_state : int, RandomState instance or None, optional, default None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + tol : float, optional, default 1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + + verbose : bool, optional, default False + Whether to print progress messages to stdout. + + warm_start : bool, optional, default False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default 0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : boolean, default True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least ``tol`` for + ``n_iter_no_change`` consecutive epochs. + Only effective when solver='sgd' or 'adam' + + validation_fraction : float, optional, default 0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. 
+ Only used if early_stopping is True + + beta_1 : float, optional, default 0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam' + + beta_2 : float, optional, default 0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam' + + epsilon : float, optional, default 1e-8 + Value for numerical stability in adam. Only used when solver='adam' + + n_iter_no_change : int, optional, default 10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam' + + .. versionadded:: 0.20 + + max_fun : int, optional, default 15000 + Only used when solver='lbfgs'. Maximum number of function calls. + The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of function calls. + Note that number of function calls will be greater than or equal to + the number of iterations for the MLPRegressor. + + .. versionadded:: 0.22 + + Attributes + ---------- + loss_ : float + The current loss computed with the loss function. + + coefs_ : list, length n_layers - 1 + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list, length n_layers - 1 + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_iter_ : int, + The number of iterations the solver has ran. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : string + Name of the output activation function. + + Notes + ----- + MLPRegressor trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense and sparse numpy + arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. + "Connectionist learning procedures." Artificial intelligence 40.1 + (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of + training deep feedforward neural networks." International Conference + on Artificial Intelligence and Statistics. 2010. + + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." arXiv preprint + arXiv:1502.01852 (2015). + + Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic + optimization." arXiv preprint arXiv:1412.6980 (2014). 
+ """ + def __init__(self, hidden_layer_sizes=(100,), activation="relu", + solver='adam', alpha=0.0001, + batch_size='auto', learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, max_iter=200, shuffle=True, + random_state=None, tol=1e-4, + verbose=False, warm_start=False, momentum=0.9, + nesterovs_momentum=True, early_stopping=False, + validation_fraction=0.1, beta_1=0.9, beta_2=0.999, + epsilon=1e-8, n_iter_no_change=10, max_fun=15000): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, solver=solver, alpha=alpha, + batch_size=batch_size, learning_rate=learning_rate, + learning_rate_init=learning_rate_init, power_t=power_t, + max_iter=max_iter, loss='squared_loss', shuffle=shuffle, + random_state=random_state, tol=tol, verbose=verbose, + warm_start=warm_start, momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, + n_iter_no_change=n_iter_no_change, max_fun=max_fun) + + def predict(self, X): + """Predict using the multi-layer perceptron model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + + Returns + ------- + y : array-like, shape (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + y_pred = self._predict(X) + if y_pred.shape[1] == 1: + return y_pred.ravel() + return y_pred + + def _validate_input(self, X, y, incremental): + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=True) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + return X, y diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py new file mode 100644 index 0000000000000..efe3aeda951af --- /dev/null +++ b/sklearn/neural_network/_rbm.py @@ -0,0 +1,367 @@ +"""Restricted Boltzmann Machine +""" + +# Authors: Yann N. Dauphin +# Vlad Niculae +# Gabriel Synnaeve +# Lars Buitinck +# License: BSD 3 clause + +import time + +import numpy as np +import scipy.sparse as sp +from scipy.special import expit # logistic function + +from ..base import BaseEstimator +from ..base import TransformerMixin +from ..utils import check_array +from ..utils import check_random_state +from ..utils import gen_even_slices +from ..utils.extmath import safe_sparse_dot +from ..utils.extmath import log_logistic +from ..utils.validation import check_is_fitted + + +class BernoulliRBM(TransformerMixin, BaseEstimator): + """Bernoulli Restricted Boltzmann Machine (RBM). + + A Restricted Boltzmann Machine with binary visible units and + binary hidden units. Parameters are estimated using Stochastic Maximum + Likelihood (SML), also known as Persistent Contrastive Divergence (PCD) + [2]. + + The time complexity of this implementation is ``O(d ** 2)`` assuming + d ~ n_features ~ n_components. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, optional + Number of binary hidden units. + + learning_rate : float, optional + The learning rate for weight updates. It is *highly* recommended + to tune this hyper-parameter. Reasonable values are in the + 10**[0., -3.] range. + + batch_size : int, optional + Number of examples per minibatch. + + n_iter : int, optional + Number of iterations/sweeps over the training dataset to perform + during training. + + verbose : int, optional + The verbosity level. The default, zero, means silent mode. 
+ + random_state : integer or RandomState, optional + A random number generator instance to define the state of the + random permutations generator. If an integer is given, it fixes the + seed. Defaults to the global numpy random number generator. + + Attributes + ---------- + intercept_hidden_ : array-like, shape (n_components,) + Biases of the hidden units. + + intercept_visible_ : array-like, shape (n_features,) + Biases of the visible units. + + components_ : array-like, shape (n_components, n_features) + Weight matrix, where n_features in the number of + visible units and n_components is the number of hidden units. + + h_samples_ : array-like, shape (batch_size, n_components) + Hidden Activation sampled from the model distribution, + where batch_size in the number of examples per minibatch and + n_components is the number of hidden units. + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.neural_network import BernoulliRBM + >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) + >>> model = BernoulliRBM(n_components=2) + >>> model.fit(X) + BernoulliRBM(n_components=2) + + References + ---------- + + [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for + deep belief nets. Neural Computation 18, pp 1527-1554. + https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf + + [2] Tieleman, T. Training Restricted Boltzmann Machines using + Approximations to the Likelihood Gradient. International Conference + on Machine Learning (ICML) 2008 + """ + def __init__(self, n_components=256, learning_rate=0.1, batch_size=10, + n_iter=10, verbose=0, random_state=None): + self.n_components = n_components + self.learning_rate = learning_rate + self.batch_size = batch_size + self.n_iter = n_iter + self.verbose = verbose + self.random_state = random_state + + def transform(self, X): + """Compute the hidden layer activation probabilities, P(h=1|v=X). + + Parameters + ---------- + X : {array-like, sparse matrix} shape (n_samples, n_features) + The data to be transformed. + + Returns + ------- + h : array, shape (n_samples, n_components) + Latent representations of the data. + """ + check_is_fitted(self) + + X = check_array(X, accept_sparse='csr', dtype=np.float64) + return self._mean_hiddens(X) + + def _mean_hiddens(self, v): + """Computes the probabilities P(h=1|v). + + Parameters + ---------- + v : array-like, shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + h : array-like, shape (n_samples, n_components) + Corresponding mean field values for the hidden layer. + """ + p = safe_sparse_dot(v, self.components_.T) + p += self.intercept_hidden_ + return expit(p, out=p) + + def _sample_hiddens(self, v, rng): + """Sample from the distribution P(h|v). + + Parameters + ---------- + v : array-like, shape (n_samples, n_features) + Values of the visible layer to sample from. + + rng : RandomState + Random number generator to use. + + Returns + ------- + h : array-like, shape (n_samples, n_components) + Values of the hidden layer. + """ + p = self._mean_hiddens(v) + return (rng.random_sample(size=p.shape) < p) + + def _sample_visibles(self, h, rng): + """Sample from the distribution P(v|h). + + Parameters + ---------- + h : array-like, shape (n_samples, n_components) + Values of the hidden layer to sample from. + + rng : RandomState + Random number generator to use. + + Returns + ------- + v : array-like, shape (n_samples, n_features) + Values of the visible layer. 
+ """ + p = np.dot(h, self.components_) + p += self.intercept_visible_ + expit(p, out=p) + return (rng.random_sample(size=p.shape) < p) + + def _free_energy(self, v): + """Computes the free energy F(v) = - log sum_h exp(-E(v,h)). + + Parameters + ---------- + v : array-like, shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + free_energy : array-like, shape (n_samples,) + The value of the free energy. + """ + return (- safe_sparse_dot(v, self.intercept_visible_) + - np.logaddexp(0, safe_sparse_dot(v, self.components_.T) + + self.intercept_hidden_).sum(axis=1)) + + def gibbs(self, v): + """Perform one Gibbs sampling step. + + Parameters + ---------- + v : array-like, shape (n_samples, n_features) + Values of the visible layer to start from. + + Returns + ------- + v_new : array-like, shape (n_samples, n_features) + Values of the visible layer after one Gibbs step. + """ + check_is_fitted(self) + if not hasattr(self, "random_state_"): + self.random_state_ = check_random_state(self.random_state) + h_ = self._sample_hiddens(v, self.random_state_) + v_ = self._sample_visibles(h_, self.random_state_) + + return v_ + + def partial_fit(self, X, y=None): + """Fit the model to the data X which should contain a partial + segment of the data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data. + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + X = check_array(X, accept_sparse='csr', dtype=np.float64) + if not hasattr(self, 'random_state_'): + self.random_state_ = check_random_state(self.random_state) + if not hasattr(self, 'components_'): + self.components_ = np.asarray( + self.random_state_.normal( + 0, + 0.01, + (self.n_components, X.shape[1]) + ), + order='F') + if not hasattr(self, 'intercept_hidden_'): + self.intercept_hidden_ = np.zeros(self.n_components, ) + if not hasattr(self, 'intercept_visible_'): + self.intercept_visible_ = np.zeros(X.shape[1], ) + if not hasattr(self, 'h_samples_'): + self.h_samples_ = np.zeros((self.batch_size, self.n_components)) + + self._fit(X, self.random_state_) + + def _fit(self, v_pos, rng): + """Inner fit for one mini-batch. + + Adjust the parameters to maximize the likelihood of v using + Stochastic Maximum Likelihood (SML). + + Parameters + ---------- + v_pos : array-like, shape (n_samples, n_features) + The data to use for training. + + rng : RandomState + Random number generator to use for sampling. + """ + h_pos = self._mean_hiddens(v_pos) + v_neg = self._sample_visibles(self.h_samples_, rng) + h_neg = self._mean_hiddens(v_neg) + + lr = float(self.learning_rate) / v_pos.shape[0] + update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T + update -= np.dot(h_neg.T, v_neg) + self.components_ += lr * update + self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0)) + self.intercept_visible_ += lr * (np.asarray( + v_pos.sum(axis=0)).squeeze() - + v_neg.sum(axis=0)) + + h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0 # sample binomial + self.h_samples_ = np.floor(h_neg, h_neg) + + def score_samples(self, X): + """Compute the pseudo-likelihood of X. + + Parameters + ---------- + X : {array-like, sparse matrix} shape (n_samples, n_features) + Values of the visible layer. Must be all-boolean (not checked). + + Returns + ------- + pseudo_likelihood : array-like, shape (n_samples,) + Value of the pseudo-likelihood (proxy for likelihood). 
+ + Notes + ----- + This method is not deterministic: it computes a quantity called the + free energy on X, then on a randomly corrupted version of X, and + returns the log of the logistic function of the difference. + """ + check_is_fitted(self) + + v = check_array(X, accept_sparse='csr') + rng = check_random_state(self.random_state) + + # Randomly corrupt one feature in each sample in v. + ind = (np.arange(v.shape[0]), + rng.randint(0, v.shape[1], v.shape[0])) + if sp.issparse(v): + data = -2 * v[ind] + 1 + v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape) + else: + v_ = v.copy() + v_[ind] = 1 - v_[ind] + + fe = self._free_energy(v) + fe_ = self._free_energy(v_) + return v.shape[1] * log_logistic(fe_ - fe) + + def fit(self, X, y=None): + """Fit the model to the data X. + + Parameters + ---------- + X : {array-like, sparse matrix} shape (n_samples, n_features) + Training data. + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + X = check_array(X, accept_sparse='csr', dtype=np.float64) + n_samples = X.shape[0] + rng = check_random_state(self.random_state) + + self.components_ = np.asarray( + rng.normal(0, 0.01, (self.n_components, X.shape[1])), + order='F') + self.intercept_hidden_ = np.zeros(self.n_components, ) + self.intercept_visible_ = np.zeros(X.shape[1], ) + self.h_samples_ = np.zeros((self.batch_size, self.n_components)) + + n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + batch_slices = list(gen_even_slices(n_batches * self.batch_size, + n_batches, n_samples)) + verbose = self.verbose + begin = time.time() + for iteration in range(1, self.n_iter + 1): + for batch_slice in batch_slices: + self._fit(X[batch_slice], rng) + + if verbose: + end = time.time() + print("[%s] Iteration %d, pseudo-likelihood = %.2f," + " time = %.2fs" + % (type(self).__name__, iteration, + self.score_samples(X).mean(), end - begin)) + begin = end + + return self diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index b6367d32e57a9..0eac8614fd699 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -1,1343 +1,14 @@ -"""Multi-layer Perceptron -""" - -# Authors: Issam H. Laradji -# Andreas Mueller -# Jiyuan Qian -# License: BSD 3 clause - -import numpy as np - -from abc import ABCMeta, abstractmethod import warnings -import scipy.optimize - -from ..base import BaseEstimator, ClassifierMixin, RegressorMixin -from ..base import is_classifier -from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS -from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer -from ..model_selection import train_test_split -from ..preprocessing import LabelBinarizer -from ..utils import gen_batches, check_random_state -from ..utils import shuffle -from ..utils import check_array, check_X_y, column_or_1d -from ..exceptions import ConvergenceWarning -from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted -from ..utils.multiclass import _check_partial_fit_first_call, unique_labels -from ..utils.multiclass import type_of_target -from ..utils.optimize import _check_optimize_result - - -_STOCHASTIC_SOLVERS = ['sgd', 'adam'] - - -def _pack(coefs_, intercepts_): - """Pack the parameters into a single vector.""" - return np.hstack([l.ravel() for l in coefs_ + intercepts_]) - - -class BaseMultilayerPerceptron(BaseEstimator, metaclass=ABCMeta): - """Base class for MLP classification and regression. 
- - Warning: This class should not be used directly. - Use derived classes instead. - - .. versionadded:: 0.18 - """ - - @abstractmethod - def __init__(self, hidden_layer_sizes, activation, solver, - alpha, batch_size, learning_rate, learning_rate_init, power_t, - max_iter, loss, shuffle, random_state, tol, verbose, - warm_start, momentum, nesterovs_momentum, early_stopping, - validation_fraction, beta_1, beta_2, epsilon, - n_iter_no_change, max_fun): - self.activation = activation - self.solver = solver - self.alpha = alpha - self.batch_size = batch_size - self.learning_rate = learning_rate - self.learning_rate_init = learning_rate_init - self.power_t = power_t - self.max_iter = max_iter - self.loss = loss - self.hidden_layer_sizes = hidden_layer_sizes - self.shuffle = shuffle - self.random_state = random_state - self.tol = tol - self.verbose = verbose - self.warm_start = warm_start - self.momentum = momentum - self.nesterovs_momentum = nesterovs_momentum - self.early_stopping = early_stopping - self.validation_fraction = validation_fraction - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.n_iter_no_change = n_iter_no_change - self.max_fun = max_fun - - def _unpack(self, packed_parameters): - """Extract the coefficients and intercepts from packed_parameters.""" - for i in range(self.n_layers_ - 1): - start, end, shape = self._coef_indptr[i] - self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) - - start, end = self._intercept_indptr[i] - self.intercepts_[i] = packed_parameters[start:end] - - def _forward_pass(self, activations): - """Perform a forward pass on the network by computing the values - of the neurons in the hidden layers and the output layer. - - Parameters - ---------- - activations : list, length = n_layers - 1 - The ith element of the list holds the values of the ith layer. - """ - hidden_activation = ACTIVATIONS[self.activation] - # Iterate over the hidden layers - for i in range(self.n_layers_ - 1): - activations[i + 1] = safe_sparse_dot(activations[i], - self.coefs_[i]) - activations[i + 1] += self.intercepts_[i] - - # For the hidden layers - if (i + 1) != (self.n_layers_ - 1): - activations[i + 1] = hidden_activation(activations[i + 1]) - - # For the last layer - output_activation = ACTIVATIONS[self.out_activation_] - activations[i + 1] = output_activation(activations[i + 1]) - - return activations - - def _compute_loss_grad(self, layer, n_samples, activations, deltas, - coef_grads, intercept_grads): - """Compute the gradient of loss with respect to coefs and intercept for - specified layer. - - This function does backpropagation for the specified one layer. - """ - coef_grads[layer] = safe_sparse_dot(activations[layer].T, - deltas[layer]) - coef_grads[layer] += (self.alpha * self.coefs_[layer]) - coef_grads[layer] /= n_samples - - intercept_grads[layer] = np.mean(deltas[layer], 0) - - return coef_grads, intercept_grads - - def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, - coef_grads, intercept_grads): - """Compute the MLP loss function and its corresponding derivatives - with respect to the different parameters given in the initialization. - - Returned gradients are packed in a single vector so it can be used - in lbfgs - - Parameters - ---------- - packed_coef_inter : array-like - A vector comprising the flattened coefficients and intercepts. - - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) - The target values. 
- - activations : list, length = n_layers - 1 - The ith element of the list holds the values of the ith layer. - - deltas : list, length = n_layers - 1 - The ith element of the list holds the difference between the - activations of the i + 1 layer and the backpropagated error. - More specifically, deltas are gradients of loss with respect to z - in each layer, where z = wx + b is the value of a particular layer - before passing through the activation function - - coef_grads : list, length = n_layers - 1 - The ith element contains the amount of change used to update the - coefficient parameters of the ith layer in an iteration. - - intercept_grads : list, length = n_layers - 1 - The ith element contains the amount of change used to update the - intercept parameters of the ith layer in an iteration. - - Returns - ------- - loss : float - grad : array-like, shape (number of nodes of all layers,) - """ - self._unpack(packed_coef_inter) - loss, coef_grads, intercept_grads = self._backprop( - X, y, activations, deltas, coef_grads, intercept_grads) - grad = _pack(coef_grads, intercept_grads) - return loss, grad - - def _backprop(self, X, y, activations, deltas, coef_grads, - intercept_grads): - """Compute the MLP loss function and its corresponding derivatives - with respect to each parameter: weights and bias vectors. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) - The target values. - - activations : list, length = n_layers - 1 - The ith element of the list holds the values of the ith layer. - - deltas : list, length = n_layers - 1 - The ith element of the list holds the difference between the - activations of the i + 1 layer and the backpropagated error. - More specifically, deltas are gradients of loss with respect to z - in each layer, where z = wx + b is the value of a particular layer - before passing through the activation function - - coef_grads : list, length = n_layers - 1 - The ith element contains the amount of change used to update the - coefficient parameters of the ith layer in an iteration. - - intercept_grads : list, length = n_layers - 1 - The ith element contains the amount of change used to update the - intercept parameters of the ith layer in an iteration. 
- - Returns - ------- - loss : float - coef_grads : list, length = n_layers - 1 - intercept_grads : list, length = n_layers - 1 - """ - n_samples = X.shape[0] - - # Forward propagate - activations = self._forward_pass(activations) - - # Get loss - loss_func_name = self.loss - if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic': - loss_func_name = 'binary_log_loss' - loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1]) - # Add L2 regularization term to loss - values = np.sum( - np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_])) - loss += (0.5 * self.alpha) * values / n_samples - - # Backward propagate - last = self.n_layers_ - 2 - - # The calculation of delta[last] here works with following - # combinations of output activation and loss function: - # sigmoid and binary cross entropy, softmax and categorical cross - # entropy, and identity with squared loss - deltas[last] = activations[-1] - y - - # Compute gradient for the last layer - coef_grads, intercept_grads = self._compute_loss_grad( - last, n_samples, activations, deltas, coef_grads, intercept_grads) - - # Iterate over the hidden layers - for i in range(self.n_layers_ - 2, 0, -1): - deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) - inplace_derivative = DERIVATIVES[self.activation] - inplace_derivative(activations[i], deltas[i - 1]) - - coef_grads, intercept_grads = self._compute_loss_grad( - i - 1, n_samples, activations, deltas, coef_grads, - intercept_grads) - - return loss, coef_grads, intercept_grads - - def _initialize(self, y, layer_units): - # set all attributes, allocate weights etc for first call - # Initialize parameters - self.n_iter_ = 0 - self.t_ = 0 - self.n_outputs_ = y.shape[1] - - # Compute the number of layers - self.n_layers_ = len(layer_units) - - # Output for regression - if not is_classifier(self): - self.out_activation_ = 'identity' - # Output for multi class - elif self._label_binarizer.y_type_ == 'multiclass': - self.out_activation_ = 'softmax' - # Output for binary class and multi-label - else: - self.out_activation_ = 'logistic' - - # Initialize coefficient and intercept layers - self.coefs_ = [] - self.intercepts_ = [] - - for i in range(self.n_layers_ - 1): - coef_init, intercept_init = self._init_coef(layer_units[i], - layer_units[i + 1]) - self.coefs_.append(coef_init) - self.intercepts_.append(intercept_init) - - if self.solver in _STOCHASTIC_SOLVERS: - self.loss_curve_ = [] - self._no_improvement_count = 0 - if self.early_stopping: - self.validation_scores_ = [] - self.best_validation_score_ = -np.inf - else: - self.best_loss_ = np.inf - - def _init_coef(self, fan_in, fan_out): - # Use the initialization method recommended by - # Glorot et al. - factor = 6. - if self.activation == 'logistic': - factor = 2. - init_bound = np.sqrt(factor / (fan_in + fan_out)) - - # Generate weights and bias: - coef_init = self._random_state.uniform(-init_bound, init_bound, - (fan_in, fan_out)) - intercept_init = self._random_state.uniform(-init_bound, init_bound, - fan_out) - return coef_init, intercept_init - - def _fit(self, X, y, incremental=False): - # Make sure self.hidden_layer_sizes is a list - hidden_layer_sizes = self.hidden_layer_sizes - if not hasattr(hidden_layer_sizes, "__iter__"): - hidden_layer_sizes = [hidden_layer_sizes] - hidden_layer_sizes = list(hidden_layer_sizes) - - # Validate input parameters. - self._validate_hyperparameters() - if np.any(np.array(hidden_layer_sizes) <= 0): - raise ValueError("hidden_layer_sizes must be > 0, got %s." 
% - hidden_layer_sizes) - - X, y = self._validate_input(X, y, incremental) - n_samples, n_features = X.shape - - # Ensure y is 2D - if y.ndim == 1: - y = y.reshape((-1, 1)) - - self.n_outputs_ = y.shape[1] - - layer_units = ([n_features] + hidden_layer_sizes + - [self.n_outputs_]) - - # check random state - self._random_state = check_random_state(self.random_state) - - if not hasattr(self, 'coefs_') or (not self.warm_start and not - incremental): - # First time training the model - self._initialize(y, layer_units) - - # lbfgs does not support mini-batches - if self.solver == 'lbfgs': - batch_size = n_samples - elif self.batch_size == 'auto': - batch_size = min(200, n_samples) - else: - if self.batch_size < 1 or self.batch_size > n_samples: - warnings.warn("Got `batch_size` less than 1 or larger than " - "sample size. It is going to be clipped") - batch_size = np.clip(self.batch_size, 1, n_samples) - - # Initialize lists - activations = [X] + [None] * (len(layer_units) - 1) - deltas = [None] * (len(activations) - 1) - - coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, - n_fan_out_ in zip(layer_units[:-1], - layer_units[1:])] - - intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in - layer_units[1:]] - - # Run the Stochastic optimization solver - if self.solver in _STOCHASTIC_SOLVERS: - self._fit_stochastic(X, y, activations, deltas, coef_grads, - intercept_grads, layer_units, incremental) - - # Run the LBFGS solver - elif self.solver == 'lbfgs': - self._fit_lbfgs(X, y, activations, deltas, coef_grads, - intercept_grads, layer_units) - return self - - def _validate_hyperparameters(self): - if not isinstance(self.shuffle, bool): - raise ValueError("shuffle must be either True or False, got %s." % - self.shuffle) - if self.max_iter <= 0: - raise ValueError("max_iter must be > 0, got %s." % self.max_iter) - if self.max_fun <= 0: - raise ValueError("max_fun must be > 0, got %s." % self.max_fun) - if self.alpha < 0.0: - raise ValueError("alpha must be >= 0, got %s." % self.alpha) - if (self.learning_rate in ["constant", "invscaling", "adaptive"] and - self.learning_rate_init <= 0.0): - raise ValueError("learning_rate_init must be > 0, got %s." % - self.learning_rate) - if self.momentum > 1 or self.momentum < 0: - raise ValueError("momentum must be >= 0 and <= 1, got %s" % - self.momentum) - if not isinstance(self.nesterovs_momentum, bool): - raise ValueError("nesterovs_momentum must be either True or False," - " got %s." % self.nesterovs_momentum) - if not isinstance(self.early_stopping, bool): - raise ValueError("early_stopping must be either True or False," - " got %s." % self.early_stopping) - if self.validation_fraction < 0 or self.validation_fraction >= 1: - raise ValueError("validation_fraction must be >= 0 and < 1, " - "got %s" % self.validation_fraction) - if self.beta_1 < 0 or self.beta_1 >= 1: - raise ValueError("beta_1 must be >= 0 and < 1, got %s" % - self.beta_1) - if self.beta_2 < 0 or self.beta_2 >= 1: - raise ValueError("beta_2 must be >= 0 and < 1, got %s" % - self.beta_2) - if self.epsilon <= 0.0: - raise ValueError("epsilon must be > 0, got %s." % self.epsilon) - if self.n_iter_no_change <= 0: - raise ValueError("n_iter_no_change must be > 0, got %s." - % self.n_iter_no_change) - - # raise ValueError if not registered - if self.activation not in ACTIVATIONS: - raise ValueError("The activation '%s' is not supported. Supported " - "activations are %s." 
- % (self.activation, list(sorted(ACTIVATIONS)))) - if self.learning_rate not in ["constant", "invscaling", "adaptive"]: - raise ValueError("learning rate %s is not supported. " % - self.learning_rate) - supported_solvers = _STOCHASTIC_SOLVERS + ["lbfgs"] - if self.solver not in supported_solvers: - raise ValueError("The solver %s is not supported. " - " Expected one of: %s" % - (self.solver, ", ".join(supported_solvers))) - - def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, - intercept_grads, layer_units): - # Store meta information for the parameters - self._coef_indptr = [] - self._intercept_indptr = [] - start = 0 - - # Save sizes and indices of coefficients for faster unpacking - for i in range(self.n_layers_ - 1): - n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] - - end = start + (n_fan_in * n_fan_out) - self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) - start = end - - # Save sizes and indices of intercepts for faster unpacking - for i in range(self.n_layers_ - 1): - end = start + layer_units[i + 1] - self._intercept_indptr.append((start, end)) - start = end - - # Run LBFGS - packed_coef_inter = _pack(self.coefs_, - self.intercepts_) - - if self.verbose is True or self.verbose >= 1: - iprint = 1 - else: - iprint = -1 - - opt_res = scipy.optimize.minimize( - self._loss_grad_lbfgs, packed_coef_inter, - method="L-BFGS-B", jac=True, - options={ - "maxfun": self.max_fun, - "maxiter": self.max_iter, - "iprint": iprint, - "gtol": self.tol - }, - args=(X, y, activations, deltas, coef_grads, intercept_grads)) - self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) - self.loss_ = opt_res.fun - self._unpack(opt_res.x) - - def _fit_stochastic(self, X, y, activations, deltas, coef_grads, - intercept_grads, layer_units, incremental): - - if not incremental or not hasattr(self, '_optimizer'): - params = self.coefs_ + self.intercepts_ - - if self.solver == 'sgd': - self._optimizer = SGDOptimizer( - params, self.learning_rate_init, self.learning_rate, - self.momentum, self.nesterovs_momentum, self.power_t) - elif self.solver == 'adam': - self._optimizer = AdamOptimizer( - params, self.learning_rate_init, self.beta_1, self.beta_2, - self.epsilon) - - # early_stopping in partial_fit doesn't make sense - early_stopping = self.early_stopping and not incremental - if early_stopping: - # don't stratify in multilabel classification - should_stratify = is_classifier(self) and self.n_outputs_ == 1 - stratify = y if should_stratify else None - X, X_val, y, y_val = train_test_split( - X, y, random_state=self._random_state, - test_size=self.validation_fraction, - stratify=stratify) - if is_classifier(self): - y_val = self._label_binarizer.inverse_transform(y_val) - else: - X_val = None - y_val = None - - n_samples = X.shape[0] - - if self.batch_size == 'auto': - batch_size = min(200, n_samples) - else: - batch_size = np.clip(self.batch_size, 1, n_samples) - - try: - for it in range(self.max_iter): - if self.shuffle: - X, y = shuffle(X, y, random_state=self._random_state) - accumulated_loss = 0.0 - for batch_slice in gen_batches(n_samples, batch_size): - activations[0] = X[batch_slice] - batch_loss, coef_grads, intercept_grads = self._backprop( - X[batch_slice], y[batch_slice], activations, deltas, - coef_grads, intercept_grads) - accumulated_loss += batch_loss * (batch_slice.stop - - batch_slice.start) - - # update weights - grads = coef_grads + intercept_grads - self._optimizer.update_params(grads) - - self.n_iter_ += 1 - self.loss_ = accumulated_loss / 
X.shape[0] - - self.t_ += n_samples - self.loss_curve_.append(self.loss_) - if self.verbose: - print("Iteration %d, loss = %.8f" % (self.n_iter_, - self.loss_)) - - # update no_improvement_count based on training loss or - # validation score according to early_stopping - self._update_no_improvement_count(early_stopping, X_val, y_val) - - # for learning rate that needs to be updated at iteration end - self._optimizer.iteration_ends(self.t_) - - if self._no_improvement_count > self.n_iter_no_change: - # not better than last `n_iter_no_change` iterations by tol - # stop or decrease learning rate - if early_stopping: - msg = ("Validation score did not improve more than " - "tol=%f for %d consecutive epochs." % ( - self.tol, self.n_iter_no_change)) - else: - msg = ("Training loss did not improve more than tol=%f" - " for %d consecutive epochs." % ( - self.tol, self.n_iter_no_change)) - - is_stopping = self._optimizer.trigger_stopping( - msg, self.verbose) - if is_stopping: - break - else: - self._no_improvement_count = 0 - - if incremental: - break - - if self.n_iter_ == self.max_iter: - warnings.warn( - "Stochastic Optimizer: Maximum iterations (%d) " - "reached and the optimization hasn't converged yet." - % self.max_iter, ConvergenceWarning) - except KeyboardInterrupt: - warnings.warn("Training interrupted by user.") - - if early_stopping: - # restore best weights - self.coefs_ = self._best_coefs - self.intercepts_ = self._best_intercepts - - def _update_no_improvement_count(self, early_stopping, X_val, y_val): - if early_stopping: - # compute validation score, use that for stopping - self.validation_scores_.append(self.score(X_val, y_val)) - - if self.verbose: - print("Validation score: %f" % self.validation_scores_[-1]) - # update best parameters - # use validation_scores_, not loss_curve_ - # let's hope no-one overloads .score with mse - last_valid_score = self.validation_scores_[-1] - - if last_valid_score < (self.best_validation_score_ + - self.tol): - self._no_improvement_count += 1 - else: - self._no_improvement_count = 0 - - if last_valid_score > self.best_validation_score_: - self.best_validation_score_ = last_valid_score - self._best_coefs = [c.copy() for c in self.coefs_] - self._best_intercepts = [i.copy() - for i in self.intercepts_] - else: - if self.loss_curve_[-1] > self.best_loss_ - self.tol: - self._no_improvement_count += 1 - else: - self._no_improvement_count = 0 - if self.loss_curve_[-1] < self.best_loss_: - self.best_loss_ = self.loss_curve_[-1] - - def fit(self, X, y): - """Fit the model to data matrix X and target(s) y. - - Parameters - ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels in classification, real numbers in - regression). - - Returns - ------- - self : returns a trained MLP model. - """ - return self._fit(X, y, incremental=False) - - @property - def partial_fit(self): - """Update the model with a single iteration over the given data. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) - The target values. - - Returns - ------- - self : returns a trained MLP model. - """ - if self.solver not in _STOCHASTIC_SOLVERS: - raise AttributeError("partial_fit is only available for stochastic" - " optimizers. %s is not stochastic." 
- % self.solver) - return self._partial_fit - - def _partial_fit(self, X, y): - return self._fit(X, y, incremental=True) - - def _predict(self, X): - """Predict using the trained model - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - Returns - ------- - y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) - The decision function of the samples for each class in the model. - """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - - # Make sure self.hidden_layer_sizes is a list - hidden_layer_sizes = self.hidden_layer_sizes - if not hasattr(hidden_layer_sizes, "__iter__"): - hidden_layer_sizes = [hidden_layer_sizes] - hidden_layer_sizes = list(hidden_layer_sizes) - - layer_units = [X.shape[1]] + hidden_layer_sizes + \ - [self.n_outputs_] - - # Initialize layers - activations = [X] - - for i in range(self.n_layers_ - 1): - activations.append(np.empty((X.shape[0], - layer_units[i + 1]))) - # forward propagate - self._forward_pass(activations) - y_pred = activations[-1] - - return y_pred - - -class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): - """Multi-layer Perceptron classifier. - - This model optimizes the log-loss function using LBFGS or stochastic - gradient descent. - - .. versionadded:: 0.18 - - Parameters - ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) - The ith element represents the number of neurons in the ith - hidden layer. - - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' - Activation function for the hidden layer. - - - 'identity', no-op activation, useful to implement linear bottleneck, - returns f(x) = x - - - 'logistic', the logistic sigmoid function, - returns f(x) = 1 / (1 + exp(-x)). - - - 'tanh', the hyperbolic tan function, - returns f(x) = tanh(x). - - - 'relu', the rectified linear unit function, - returns f(x) = max(0, x) - - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' - The solver for weight optimization. - - - 'lbfgs' is an optimizer in the family of quasi-Newton methods. - - - 'sgd' refers to stochastic gradient descent. - - - 'adam' refers to a stochastic gradient-based optimizer proposed - by Kingma, Diederik, and Jimmy Ba - - Note: The default solver 'adam' works pretty well on relatively - large datasets (with thousands of training samples or more) in terms of - both training time and validation score. - For small datasets, however, 'lbfgs' can converge faster and perform - better. - - alpha : float, optional, default 0.0001 - L2 penalty (regularization term) parameter. - - batch_size : int, optional, default 'auto' - Size of minibatches for stochastic optimizers. - If the solver is 'lbfgs', the classifier will not use minibatch. - When set to "auto", `batch_size=min(200, n_samples)` - - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' - Learning rate schedule for weight updates. - - - 'constant' is a constant learning rate given by - 'learning_rate_init'. - - - 'invscaling' gradually decreases the learning rate at each - time step 't' using an inverse scaling exponent of 'power_t'. - effective_learning_rate = learning_rate_init / pow(t, power_t) - - - 'adaptive' keeps the learning rate constant to - 'learning_rate_init' as long as training loss keeps decreasing. - Each time two consecutive epochs fail to decrease training loss by at - least tol, or fail to increase validation score by at least tol if - 'early_stopping' is on, the current learning rate is divided by 5. 
- - Only used when ``solver='sgd'``. - - learning_rate_init : double, optional, default 0.001 - The initial learning rate used. It controls the step-size - in updating the weights. Only used when solver='sgd' or 'adam'. - - power_t : double, optional, default 0.5 - The exponent for inverse scaling learning rate. - It is used in updating effective learning rate when the learning_rate - is set to 'invscaling'. Only used when solver='sgd'. - - max_iter : int, optional, default 200 - Maximum number of iterations. The solver iterates until convergence - (determined by 'tol') or this number of iterations. For stochastic - solvers ('sgd', 'adam'), note that this determines the number of epochs - (how many times each data point will be used), not the number of - gradient steps. - - shuffle : bool, optional, default True - Whether to shuffle samples in each iteration. Only used when - solver='sgd' or 'adam'. - - random_state : int, RandomState instance or None, optional, default None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - tol : float, optional, default 1e-4 - Tolerance for the optimization. When the loss or score is not improving - by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, - unless ``learning_rate`` is set to 'adaptive', convergence is - considered to be reached and training stops. - - verbose : bool, optional, default False - Whether to print progress messages to stdout. - - warm_start : bool, optional, default False - When set to True, reuse the solution of the previous - call to fit as initialization, otherwise, just erase the - previous solution. See :term:`the Glossary `. - - momentum : float, default 0.9 - Momentum for gradient descent update. Should be between 0 and 1. Only - used when solver='sgd'. - - nesterovs_momentum : boolean, default True - Whether to use Nesterov's momentum. Only used when solver='sgd' and - momentum > 0. - - early_stopping : bool, default False - Whether to use early stopping to terminate training when validation - score is not improving. If set to true, it will automatically set - aside 10% of training data as validation and terminate training when - validation score is not improving by at least tol for - ``n_iter_no_change`` consecutive epochs. The split is stratified, - except in a multilabel setting. - Only effective when solver='sgd' or 'adam' - - validation_fraction : float, optional, default 0.1 - The proportion of training data to set aside as validation set for - early stopping. Must be between 0 and 1. - Only used if early_stopping is True - - beta_1 : float, optional, default 0.9 - Exponential decay rate for estimates of first moment vector in adam, - should be in [0, 1). Only used when solver='adam' - - beta_2 : float, optional, default 0.999 - Exponential decay rate for estimates of second moment vector in adam, - should be in [0, 1). Only used when solver='adam' - - epsilon : float, optional, default 1e-8 - Value for numerical stability in adam. Only used when solver='adam' - - n_iter_no_change : int, optional, default 10 - Maximum number of epochs to not meet ``tol`` improvement. - Only effective when solver='sgd' or 'adam' - - .. versionadded:: 0.20 - - max_fun : int, optional, default 15000 - Only used when solver='lbfgs'. Maximum number of loss function calls. 
- The solver iterates until convergence (determined by 'tol'), number - of iterations reaches max_iter, or this number of loss function calls. - Note that number of loss function calls will be greater than or equal - to the number of iterations for the `MLPClassifier`. - - .. versionadded:: 0.22 - - Attributes - ---------- - classes_ : array or list of array of shape (n_classes,) - Class labels for each output. - - loss_ : float - The current loss computed with the loss function. - - coefs_ : list, length n_layers - 1 - The ith element in the list represents the weight matrix corresponding - to layer i. - - intercepts_ : list, length n_layers - 1 - The ith element in the list represents the bias vector corresponding to - layer i + 1. - - n_iter_ : int, - The number of iterations the solver has ran. - - n_layers_ : int - Number of layers. - - n_outputs_ : int - Number of outputs. - - out_activation_ : string - Name of the output activation function. - - Notes - ----- - MLPClassifier trains iteratively since at each time step - the partial derivatives of the loss function with respect to the model - parameters are computed to update the parameters. - - It can also have a regularization term added to the loss function - that shrinks model parameters to prevent overfitting. - - This implementation works with data represented as dense numpy arrays or - sparse scipy arrays of floating point values. - - References - ---------- - Hinton, Geoffrey E. - "Connectionist learning procedures." Artificial intelligence 40.1 - (1989): 185-234. - - Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of - training deep feedforward neural networks." International Conference - on Artificial Intelligence and Statistics. 2010. - - He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level - performance on imagenet classification." arXiv preprint - arXiv:1502.01852 (2015). - - Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic - optimization." arXiv preprint arXiv:1412.6980 (2014). 
- """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", - solver='adam', alpha=0.0001, - batch_size='auto', learning_rate="constant", - learning_rate_init=0.001, power_t=0.5, max_iter=200, - shuffle=True, random_state=None, tol=1e-4, - verbose=False, warm_start=False, momentum=0.9, - nesterovs_momentum=True, early_stopping=False, - validation_fraction=0.1, beta_1=0.9, beta_2=0.999, - epsilon=1e-8, n_iter_no_change=10, max_fun=15000): - super().__init__( - hidden_layer_sizes=hidden_layer_sizes, - activation=activation, solver=solver, alpha=alpha, - batch_size=batch_size, learning_rate=learning_rate, - learning_rate_init=learning_rate_init, power_t=power_t, - max_iter=max_iter, loss='log_loss', shuffle=shuffle, - random_state=random_state, tol=tol, verbose=verbose, - warm_start=warm_start, momentum=momentum, - nesterovs_momentum=nesterovs_momentum, - early_stopping=early_stopping, - validation_fraction=validation_fraction, - beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, - n_iter_no_change=n_iter_no_change, max_fun=max_fun) - - def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) - if y.ndim == 2 and y.shape[1] == 1: - y = column_or_1d(y, warn=True) - - if not incremental: - self._label_binarizer = LabelBinarizer() - self._label_binarizer.fit(y) - self.classes_ = self._label_binarizer.classes_ - elif self.warm_start: - classes = unique_labels(y) - if set(classes) != set(self.classes_): - raise ValueError("warm_start can only be used where `y` has " - "the same classes as in the previous " - "call to fit. Previously got %s, `y` has %s" % - (self.classes_, classes)) - else: - classes = unique_labels(y) - if len(np.setdiff1d(classes, self.classes_, assume_unique=True)): - raise ValueError("`y` has classes not in `self.classes_`." - " `self.classes_` has %s. 'y' has %s." % - (self.classes_, classes)) - - y = self._label_binarizer.transform(y) - return X, y - - def predict(self, X): - """Predict using the multi-layer perceptron classifier - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - Returns - ------- - y : array-like, shape (n_samples,) or (n_samples, n_classes) - The predicted classes. - """ - check_is_fitted(self) - y_pred = self._predict(X) - - if self.n_outputs_ == 1: - y_pred = y_pred.ravel() - - return self._label_binarizer.inverse_transform(y_pred) - - def fit(self, X, y): - """Fit the model to data matrix X and target(s) y. - - Parameters - ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels in classification, real numbers in - regression). - - Returns - ------- - self : returns a trained MLP model. - """ - return self._fit(X, y, incremental=(self.warm_start and - hasattr(self, "classes_"))) - - @property - def partial_fit(self): - """Update the model with a single iteration over the given data. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - y : array-like, shape (n_samples,) - The target values. - - classes : array, shape (n_classes), default None - Classes across all calls to partial_fit. - Can be obtained via `np.unique(y_all)`, where y_all is the - target vector of the entire dataset. - This argument is required for the first call to partial_fit - and can be omitted in the subsequent calls. 
- Note that y doesn't need to contain all labels in `classes`. - - Returns - ------- - self : returns a trained MLP model. - """ - if self.solver not in _STOCHASTIC_SOLVERS: - raise AttributeError("partial_fit is only available for stochastic" - " optimizer. %s is not stochastic" - % self.solver) - return self._partial_fit - - def _partial_fit(self, X, y, classes=None): - if _check_partial_fit_first_call(self, classes): - self._label_binarizer = LabelBinarizer() - if type_of_target(y).startswith('multilabel'): - self._label_binarizer.fit(y) - else: - self._label_binarizer.fit(classes) - - super()._partial_fit(X, y) - - return self - - def predict_log_proba(self, X): - """Return the log of probability estimates. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The input data. - - Returns - ------- - log_y_prob : array-like, shape (n_samples, n_classes) - The predicted log-probability of the sample for each class - in the model, where classes are ordered as they are in - `self.classes_`. Equivalent to log(predict_proba(X)) - """ - y_prob = self.predict_proba(X) - return np.log(y_prob, out=y_prob) - - def predict_proba(self, X): - """Probability estimates. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - - Returns - ------- - y_prob : array-like, shape (n_samples, n_classes) - The predicted probability of the sample for each class in the - model, where classes are ordered as they are in `self.classes_`. - """ - check_is_fitted(self) - y_pred = self._predict(X) - - if self.n_outputs_ == 1: - y_pred = y_pred.ravel() - - if y_pred.ndim == 1: - return np.vstack([1 - y_pred, y_pred]).T - else: - return y_pred - - -class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): - """Multi-layer Perceptron regressor. - - This model optimizes the squared-loss using LBFGS or stochastic gradient - descent. - - .. versionadded:: 0.18 - - Parameters - ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) - The ith element represents the number of neurons in the ith - hidden layer. - - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' - Activation function for the hidden layer. - - - 'identity', no-op activation, useful to implement linear bottleneck, - returns f(x) = x - - - 'logistic', the logistic sigmoid function, - returns f(x) = 1 / (1 + exp(-x)). - - - 'tanh', the hyperbolic tan function, - returns f(x) = tanh(x). - - - 'relu', the rectified linear unit function, - returns f(x) = max(0, x) - - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' - The solver for weight optimization. - - - 'lbfgs' is an optimizer in the family of quasi-Newton methods. - - - 'sgd' refers to stochastic gradient descent. - - - 'adam' refers to a stochastic gradient-based optimizer proposed by - Kingma, Diederik, and Jimmy Ba - - Note: The default solver 'adam' works pretty well on relatively - large datasets (with thousands of training samples or more) in terms of - both training time and validation score. - For small datasets, however, 'lbfgs' can converge faster and perform - better. - - alpha : float, optional, default 0.0001 - L2 penalty (regularization term) parameter. - - batch_size : int, optional, default 'auto' - Size of minibatches for stochastic optimizers. - If the solver is 'lbfgs', the classifier will not use minibatch. 
- When set to "auto", `batch_size=min(200, n_samples)` - - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' - Learning rate schedule for weight updates. - - - 'constant' is a constant learning rate given by - 'learning_rate_init'. - - - 'invscaling' gradually decreases the learning rate ``learning_rate_`` - at each time step 't' using an inverse scaling exponent of 'power_t'. - effective_learning_rate = learning_rate_init / pow(t, power_t) - - - 'adaptive' keeps the learning rate constant to - 'learning_rate_init' as long as training loss keeps decreasing. - Each time two consecutive epochs fail to decrease training loss by at - least tol, or fail to increase validation score by at least tol if - 'early_stopping' is on, the current learning rate is divided by 5. - - Only used when solver='sgd'. - - learning_rate_init : double, optional, default 0.001 - The initial learning rate used. It controls the step-size - in updating the weights. Only used when solver='sgd' or 'adam'. - - power_t : double, optional, default 0.5 - The exponent for inverse scaling learning rate. - It is used in updating effective learning rate when the learning_rate - is set to 'invscaling'. Only used when solver='sgd'. - - max_iter : int, optional, default 200 - Maximum number of iterations. The solver iterates until convergence - (determined by 'tol') or this number of iterations. For stochastic - solvers ('sgd', 'adam'), note that this determines the number of epochs - (how many times each data point will be used), not the number of - gradient steps. - - shuffle : bool, optional, default True - Whether to shuffle samples in each iteration. Only used when - solver='sgd' or 'adam'. - - random_state : int, RandomState instance or None, optional, default None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - tol : float, optional, default 1e-4 - Tolerance for the optimization. When the loss or score is not improving - by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, - unless ``learning_rate`` is set to 'adaptive', convergence is - considered to be reached and training stops. - - verbose : bool, optional, default False - Whether to print progress messages to stdout. - - warm_start : bool, optional, default False - When set to True, reuse the solution of the previous - call to fit as initialization, otherwise, just erase the - previous solution. See :term:`the Glossary `. - - momentum : float, default 0.9 - Momentum for gradient descent update. Should be between 0 and 1. Only - used when solver='sgd'. - - nesterovs_momentum : boolean, default True - Whether to use Nesterov's momentum. Only used when solver='sgd' and - momentum > 0. - - early_stopping : bool, default False - Whether to use early stopping to terminate training when validation - score is not improving. If set to true, it will automatically set - aside 10% of training data as validation and terminate training when - validation score is not improving by at least ``tol`` for - ``n_iter_no_change`` consecutive epochs. - Only effective when solver='sgd' or 'adam' - - validation_fraction : float, optional, default 0.1 - The proportion of training data to set aside as validation set for - early stopping. Must be between 0 and 1. 
- Only used if early_stopping is True - - beta_1 : float, optional, default 0.9 - Exponential decay rate for estimates of first moment vector in adam, - should be in [0, 1). Only used when solver='adam' - - beta_2 : float, optional, default 0.999 - Exponential decay rate for estimates of second moment vector in adam, - should be in [0, 1). Only used when solver='adam' - - epsilon : float, optional, default 1e-8 - Value for numerical stability in adam. Only used when solver='adam' - - n_iter_no_change : int, optional, default 10 - Maximum number of epochs to not meet ``tol`` improvement. - Only effective when solver='sgd' or 'adam' - - .. versionadded:: 0.20 - - max_fun : int, optional, default 15000 - Only used when solver='lbfgs'. Maximum number of function calls. - The solver iterates until convergence (determined by 'tol'), number - of iterations reaches max_iter, or this number of function calls. - Note that number of function calls will be greater than or equal to - the number of iterations for the MLPRegressor. - - .. versionadded:: 0.22 - - Attributes - ---------- - loss_ : float - The current loss computed with the loss function. - - coefs_ : list, length n_layers - 1 - The ith element in the list represents the weight matrix corresponding - to layer i. - - intercepts_ : list, length n_layers - 1 - The ith element in the list represents the bias vector corresponding to - layer i + 1. - - n_iter_ : int, - The number of iterations the solver has ran. - - n_layers_ : int - Number of layers. - - n_outputs_ : int - Number of outputs. - - out_activation_ : string - Name of the output activation function. - - Notes - ----- - MLPRegressor trains iteratively since at each time step - the partial derivatives of the loss function with respect to the model - parameters are computed to update the parameters. - - It can also have a regularization term added to the loss function - that shrinks model parameters to prevent overfitting. - - This implementation works with data represented as dense and sparse numpy - arrays of floating point values. - - References - ---------- - Hinton, Geoffrey E. - "Connectionist learning procedures." Artificial intelligence 40.1 - (1989): 185-234. - - Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of - training deep feedforward neural networks." International Conference - on Artificial Intelligence and Statistics. 2010. - - He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level - performance on imagenet classification." arXiv preprint - arXiv:1502.01852 (2015). - - Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic - optimization." arXiv preprint arXiv:1412.6980 (2014). 
- """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", - solver='adam', alpha=0.0001, - batch_size='auto', learning_rate="constant", - learning_rate_init=0.001, - power_t=0.5, max_iter=200, shuffle=True, - random_state=None, tol=1e-4, - verbose=False, warm_start=False, momentum=0.9, - nesterovs_momentum=True, early_stopping=False, - validation_fraction=0.1, beta_1=0.9, beta_2=0.999, - epsilon=1e-8, n_iter_no_change=10, max_fun=15000): - super().__init__( - hidden_layer_sizes=hidden_layer_sizes, - activation=activation, solver=solver, alpha=alpha, - batch_size=batch_size, learning_rate=learning_rate, - learning_rate_init=learning_rate_init, power_t=power_t, - max_iter=max_iter, loss='squared_loss', shuffle=shuffle, - random_state=random_state, tol=tol, verbose=verbose, - warm_start=warm_start, momentum=momentum, - nesterovs_momentum=nesterovs_momentum, - early_stopping=early_stopping, - validation_fraction=validation_fraction, - beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, - n_iter_no_change=n_iter_no_change, max_fun=max_fun) - - def predict(self, X): - """Predict using the multi-layer perceptron model. +from ._multilayer_perceptron import * # noqa - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data. - Returns - ------- - y : array-like, shape (n_samples, n_outputs) - The predicted values. - """ - check_is_fitted(self) - y_pred = self._predict(X) - if y_pred.shape[1] == 1: - return y_pred.ravel() - return y_pred +msg = ("The `sklearn.neural_nework.multilayer_perceptron` module is " + "deprecated in version 0.22 and will be removed in version 0.24. " + "The corresponding classes / functions " + "should instead be imported from sklearn.neural_nework. " + "Anything that cannot be imported from sklearn.neural_network is now " + "part of the private API.") - def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=True) - if y.ndim == 2 and y.shape[1] == 1: - y = column_or_1d(y, warn=True) - return X, y +# TODO: remove entire file in 0.24 +warnings.warn(msg, DeprecationWarning) diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index efe3aeda951af..b7a3051af739c 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -1,367 +1,14 @@ -"""Restricted Boltzmann Machine -""" +import warnings -# Authors: Yann N. Dauphin -# Vlad Niculae -# Gabriel Synnaeve -# Lars Buitinck -# License: BSD 3 clause +from ._rbm import * # noqa -import time -import numpy as np -import scipy.sparse as sp -from scipy.special import expit # logistic function +msg = ("The `sklearn.neural_nework.rbm` module is deprecated in version " + "0.22 and will be removed in version 0.24. " + "The corresponding classes / functions " + "should instead be imported from sklearn.neural_nework. " + "Anything that cannot be imported from sklearn.neural_network is now " + "part of the private API.") -from ..base import BaseEstimator -from ..base import TransformerMixin -from ..utils import check_array -from ..utils import check_random_state -from ..utils import gen_even_slices -from ..utils.extmath import safe_sparse_dot -from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted - - -class BernoulliRBM(TransformerMixin, BaseEstimator): - """Bernoulli Restricted Boltzmann Machine (RBM). - - A Restricted Boltzmann Machine with binary visible units and - binary hidden units. 
Parameters are estimated using Stochastic Maximum - Likelihood (SML), also known as Persistent Contrastive Divergence (PCD) - [2]. - - The time complexity of this implementation is ``O(d ** 2)`` assuming - d ~ n_features ~ n_components. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional - Number of binary hidden units. - - learning_rate : float, optional - The learning rate for weight updates. It is *highly* recommended - to tune this hyper-parameter. Reasonable values are in the - 10**[0., -3.] range. - - batch_size : int, optional - Number of examples per minibatch. - - n_iter : int, optional - Number of iterations/sweeps over the training dataset to perform - during training. - - verbose : int, optional - The verbosity level. The default, zero, means silent mode. - - random_state : integer or RandomState, optional - A random number generator instance to define the state of the - random permutations generator. If an integer is given, it fixes the - seed. Defaults to the global numpy random number generator. - - Attributes - ---------- - intercept_hidden_ : array-like, shape (n_components,) - Biases of the hidden units. - - intercept_visible_ : array-like, shape (n_features,) - Biases of the visible units. - - components_ : array-like, shape (n_components, n_features) - Weight matrix, where n_features in the number of - visible units and n_components is the number of hidden units. - - h_samples_ : array-like, shape (batch_size, n_components) - Hidden Activation sampled from the model distribution, - where batch_size in the number of examples per minibatch and - n_components is the number of hidden units. - - Examples - -------- - - >>> import numpy as np - >>> from sklearn.neural_network import BernoulliRBM - >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - >>> model = BernoulliRBM(n_components=2) - >>> model.fit(X) - BernoulliRBM(n_components=2) - - References - ---------- - - [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for - deep belief nets. Neural Computation 18, pp 1527-1554. - https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf - - [2] Tieleman, T. Training Restricted Boltzmann Machines using - Approximations to the Likelihood Gradient. International Conference - on Machine Learning (ICML) 2008 - """ - def __init__(self, n_components=256, learning_rate=0.1, batch_size=10, - n_iter=10, verbose=0, random_state=None): - self.n_components = n_components - self.learning_rate = learning_rate - self.batch_size = batch_size - self.n_iter = n_iter - self.verbose = verbose - self.random_state = random_state - - def transform(self, X): - """Compute the hidden layer activation probabilities, P(h=1|v=X). - - Parameters - ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) - The data to be transformed. - - Returns - ------- - h : array, shape (n_samples, n_components) - Latent representations of the data. - """ - check_is_fitted(self) - - X = check_array(X, accept_sparse='csr', dtype=np.float64) - return self._mean_hiddens(X) - - def _mean_hiddens(self, v): - """Computes the probabilities P(h=1|v). - - Parameters - ---------- - v : array-like, shape (n_samples, n_features) - Values of the visible layer. - - Returns - ------- - h : array-like, shape (n_samples, n_components) - Corresponding mean field values for the hidden layer. 
- """ - p = safe_sparse_dot(v, self.components_.T) - p += self.intercept_hidden_ - return expit(p, out=p) - - def _sample_hiddens(self, v, rng): - """Sample from the distribution P(h|v). - - Parameters - ---------- - v : array-like, shape (n_samples, n_features) - Values of the visible layer to sample from. - - rng : RandomState - Random number generator to use. - - Returns - ------- - h : array-like, shape (n_samples, n_components) - Values of the hidden layer. - """ - p = self._mean_hiddens(v) - return (rng.random_sample(size=p.shape) < p) - - def _sample_visibles(self, h, rng): - """Sample from the distribution P(v|h). - - Parameters - ---------- - h : array-like, shape (n_samples, n_components) - Values of the hidden layer to sample from. - - rng : RandomState - Random number generator to use. - - Returns - ------- - v : array-like, shape (n_samples, n_features) - Values of the visible layer. - """ - p = np.dot(h, self.components_) - p += self.intercept_visible_ - expit(p, out=p) - return (rng.random_sample(size=p.shape) < p) - - def _free_energy(self, v): - """Computes the free energy F(v) = - log sum_h exp(-E(v,h)). - - Parameters - ---------- - v : array-like, shape (n_samples, n_features) - Values of the visible layer. - - Returns - ------- - free_energy : array-like, shape (n_samples,) - The value of the free energy. - """ - return (- safe_sparse_dot(v, self.intercept_visible_) - - np.logaddexp(0, safe_sparse_dot(v, self.components_.T) - + self.intercept_hidden_).sum(axis=1)) - - def gibbs(self, v): - """Perform one Gibbs sampling step. - - Parameters - ---------- - v : array-like, shape (n_samples, n_features) - Values of the visible layer to start from. - - Returns - ------- - v_new : array-like, shape (n_samples, n_features) - Values of the visible layer after one Gibbs step. - """ - check_is_fitted(self) - if not hasattr(self, "random_state_"): - self.random_state_ = check_random_state(self.random_state) - h_ = self._sample_hiddens(v, self.random_state_) - v_ = self._sample_visibles(h_, self.random_state_) - - return v_ - - def partial_fit(self, X, y=None): - """Fit the model to the data X which should contain a partial - segment of the data. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training data. - - Returns - ------- - self : BernoulliRBM - The fitted model. - """ - X = check_array(X, accept_sparse='csr', dtype=np.float64) - if not hasattr(self, 'random_state_'): - self.random_state_ = check_random_state(self.random_state) - if not hasattr(self, 'components_'): - self.components_ = np.asarray( - self.random_state_.normal( - 0, - 0.01, - (self.n_components, X.shape[1]) - ), - order='F') - if not hasattr(self, 'intercept_hidden_'): - self.intercept_hidden_ = np.zeros(self.n_components, ) - if not hasattr(self, 'intercept_visible_'): - self.intercept_visible_ = np.zeros(X.shape[1], ) - if not hasattr(self, 'h_samples_'): - self.h_samples_ = np.zeros((self.batch_size, self.n_components)) - - self._fit(X, self.random_state_) - - def _fit(self, v_pos, rng): - """Inner fit for one mini-batch. - - Adjust the parameters to maximize the likelihood of v using - Stochastic Maximum Likelihood (SML). - - Parameters - ---------- - v_pos : array-like, shape (n_samples, n_features) - The data to use for training. - - rng : RandomState - Random number generator to use for sampling. 
- """ - h_pos = self._mean_hiddens(v_pos) - v_neg = self._sample_visibles(self.h_samples_, rng) - h_neg = self._mean_hiddens(v_neg) - - lr = float(self.learning_rate) / v_pos.shape[0] - update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T - update -= np.dot(h_neg.T, v_neg) - self.components_ += lr * update - self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0)) - self.intercept_visible_ += lr * (np.asarray( - v_pos.sum(axis=0)).squeeze() - - v_neg.sum(axis=0)) - - h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0 # sample binomial - self.h_samples_ = np.floor(h_neg, h_neg) - - def score_samples(self, X): - """Compute the pseudo-likelihood of X. - - Parameters - ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) - Values of the visible layer. Must be all-boolean (not checked). - - Returns - ------- - pseudo_likelihood : array-like, shape (n_samples,) - Value of the pseudo-likelihood (proxy for likelihood). - - Notes - ----- - This method is not deterministic: it computes a quantity called the - free energy on X, then on a randomly corrupted version of X, and - returns the log of the logistic function of the difference. - """ - check_is_fitted(self) - - v = check_array(X, accept_sparse='csr') - rng = check_random_state(self.random_state) - - # Randomly corrupt one feature in each sample in v. - ind = (np.arange(v.shape[0]), - rng.randint(0, v.shape[1], v.shape[0])) - if sp.issparse(v): - data = -2 * v[ind] + 1 - v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape) - else: - v_ = v.copy() - v_[ind] = 1 - v_[ind] - - fe = self._free_energy(v) - fe_ = self._free_energy(v_) - return v.shape[1] * log_logistic(fe_ - fe) - - def fit(self, X, y=None): - """Fit the model to the data X. - - Parameters - ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) - Training data. - - Returns - ------- - self : BernoulliRBM - The fitted model. - """ - X = check_array(X, accept_sparse='csr', dtype=np.float64) - n_samples = X.shape[0] - rng = check_random_state(self.random_state) - - self.components_ = np.asarray( - rng.normal(0, 0.01, (self.n_components, X.shape[1])), - order='F') - self.intercept_hidden_ = np.zeros(self.n_components, ) - self.intercept_visible_ = np.zeros(X.shape[1], ) - self.h_samples_ = np.zeros((self.batch_size, self.n_components)) - - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) - batch_slices = list(gen_even_slices(n_batches * self.batch_size, - n_batches, n_samples)) - verbose = self.verbose - begin = time.time() - for iteration in range(1, self.n_iter + 1): - for batch_slice in batch_slices: - self._fit(X[batch_slice], rng) - - if verbose: - end = time.time() - print("[%s] Iteration %d, pseudo-likelihood = %.2f," - " time = %.2fs" - % (type(self).__name__, iteration, - self.score_samples(X).mean(), end - begin)) - begin = end - - return self +# TODO: remove entire file in 0.24 +warnings.warn(msg, DeprecationWarning) diff --git a/sklearn/neural_network/tests/test_import_deprecations.py b/sklearn/neural_network/tests/test_import_deprecations.py new file mode 100644 index 0000000000000..94c230bac5ba5 --- /dev/null +++ b/sklearn/neural_network/tests/test_import_deprecations.py @@ -0,0 +1,28 @@ +import textwrap +from sklearn.utils.testing import assert_run_python_script + + +# This file makes sure importing from sklearn.neural_net.rbm or +# sklearn.neural_net.multilayer_perceptron raises a deprecation warning. 
+
+
+def test_rbm():
+    script = """
+    import pytest
+
+    with pytest.warns(DeprecationWarning,
+                      match="removed in version 0.24"):
+        from sklearn.neural_network.rbm import BernoulliRBM
+    """
+    assert_run_python_script(textwrap.dedent(script))
+
+
+def test_multilayer_perceptron():
+    script = """
+    import pytest
+
+    with pytest.warns(DeprecationWarning,
+                      match="removed in version 0.24"):
+        from sklearn.neural_network.multilayer_perceptron import MLPClassifier
+    """
+    assert_run_python_script(textwrap.dedent(script))
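
Reviewer note on the new test: the shim modules call warnings.warn(msg, DeprecationWarning) at module level, i.e. at import time, and Python caches imported modules, so the warning can fire at most once per interpreter; that is presumably why each snippet is run through assert_run_python_script in a fresh subprocess rather than with an in-process pytest.warns. The sketch below is illustrative only and is not part of the patch; the warnings-filter handling is an assumption about the caller's environment. It shows what downstream code would observe on the deprecated import path and how imports should be migrated, following the recommendation in the deprecation message itself:

    import warnings

    # Deprecated path: the shim module emits the DeprecationWarning when it is
    # first imported, so it is only observable on the first import in a given
    # process, and the default warning filters may hide it.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        from sklearn.neural_network.multilayer_perceptron import MLPClassifier  # noqa: F401
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Recommended path going forward, as stated in the deprecation message:
    # import directly from sklearn.neural_network.
    from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor  # noqa: F401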