diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 10e4f27f5490e..157083c010390 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -12,25 +12,25 @@ class Perceptron(BaseSGDClassifier): Parameters ---------- - penalty : None, 'l2' or 'l1' or 'elasticnet' - The penalty (aka regularization term) to be used. Defaults to None. + penalty : {'l2', 'l1', 'elasticnet'}, default=None + The penalty (aka regularization term) to be used. - alpha : float + alpha : float, default=0.0001 Constant that multiplies the regularization term if regularization is - used. Defaults to 0.0001 + used. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > previous_loss - tol). @@ -39,20 +39,20 @@ class Perceptron(BaseSGDClassifier): shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - verbose : integer, default=0 + verbose : int, default=0 The verbosity level - eta0 : double - Constant by which the updates are multiplied. Defaults to 1. + eta0 : double, default=1 + Constant by which the updates are multiplied. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -80,7 +80,7 @@ class Perceptron(BaseSGDClassifier): .. versionadded:: 0.20 - class_weight : dict, {class_label: weight} or "balanced" or None, optional + class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes @@ -97,18 +97,18 @@ class Perceptron(BaseSGDClassifier): Attributes ---------- - coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ - n_features] + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) Weights assigned to the features. - intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) Constants in decision function. n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The unique classes labels.
t_ : int diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2a24fba4675a5..ec2f29dbb2317 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -245,11 +245,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Parameters ---------- - X : {array-like, sparse matrix, LinearOperator} of shape \ + X : {ndarray, sparse matrix, LinearOperator} of shape \ (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values alpha : float or array-like of shape (n_targets,) @@ -268,7 +268,8 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.17 - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -308,7 +309,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.19 SAGA solver. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For the 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' and saga solver, the default value is @@ -321,7 +322,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -349,14 +350,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Returns ------- - coef : array of shape (n_features,) or (n_targets, n_features) + coef : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. - intercept : float or array of shape (n_targets,) + intercept : float or ndarray of shape (n_targets,) The intercept of the model. Only returned if `return_intercept` is True and if X is a scipy sparse array. @@ -618,7 +619,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Parameters ---------- - alpha : {float, array-like of shape (n_targets,)}, default=1.0 + alpha : {float, ndarray of shape (n_targets,)}, default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. @@ -643,7 +644,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. @@ -651,7 +652,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): tol : float, default=1e-3 Precision of the solution. 
- solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -688,7 +690,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -700,14 +702,14 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (n_features,) or (n_targets, n_features) + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. @@ -747,13 +749,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -798,14 +800,14 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. tol : float, default=1e-3 Precision of the solution. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -813,7 +815,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -847,7 +850,7 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, default=None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. 
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -856,20 +859,20 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. See Also @@ -907,13 +910,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1135,7 +1138,7 @@ def _compute_gram(self, X, sqrt_sw): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) The preprocessed design matrix. sqrt_sw : ndarray of shape (n_samples,) @@ -1425,13 +1428,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. Will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to float64 if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1543,14 +1546,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. If using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1634,14 +1637,14 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. 
- scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If None, the negative mean squared error if cv is 'auto' or None (i.e. when using generalized cross-validation), and r2 score otherwise. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1658,7 +1661,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. - gcv_mode : {None, 'auto', 'svd', eigen'}, optional + gcv_mode : {'auto', 'svd', 'eigen'}, default='auto' Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: @@ -1670,7 +1673,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): The 'auto' mode is the default and is intended to pick the cheaper option of the two depending on the shape of the training data. - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1678,17 +1681,17 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional + cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True``\ and ``cv=None``). After ``fit()`` has been called, this attribute \ will contain the mean squared errors (by default) or the values \ of the ``{loss,score}_func`` function (if provided in the constructor). - coef_ : array of shape (n_features) or (n_targets, n_features) + coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1750,12 +1753,12 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1767,7 +1770,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one.
@@ -1775,7 +1778,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1783,19 +1786,19 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_targets, n_alphas), optional + cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). This attribute exists only when ``store_cv_values`` is True. - coef_ : array of shape (1, n_features) or (n_targets, n_features) + coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1805,7 +1808,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): best_score_ : float Score of base estimator with best alpha. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. Examples @@ -1843,15 +1846,15 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. When using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index b6367d32e57a9..51af0e33139dd 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -140,13 +140,13 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, Parameters ---------- - packed_coef_inter : array-like + packed_coef_inter : ndarray A vector comprising the flattened coefficients and intercepts. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. activations : list, length = n_layers - 1 @@ -185,10 +185,10 @@ def _backprop(self, X, y, activations, deltas, coef_grads, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. 
- y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. activations : list, length = n_layers - 1 @@ -613,10 +613,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -632,10 +632,10 @@ def partial_fit(self): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. Returns ------- @@ -656,12 +656,12 @@ def _predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) @@ -698,11 +698,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -717,7 +717,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. @@ -733,15 +733,15 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter. - batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -759,55 +759,55 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Only used when ``solver='sgd'``. - learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate.
It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary <warm_start>`. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : bool, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -816,29 +816,29 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): except in a multilabel setting. Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'.
Maximum number of loss function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of loss function calls. @@ -849,7 +849,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Attributes ---------- - classes_ : array or list of array of shape (n_classes,) + classes_ : ndarray or list of ndarray of shape (n_classes,) Class labels for each output. loss_ : float @@ -959,12 +959,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples,) or (n_samples, n_classes) + y : ndarray of shape (n_samples,) or (n_samples, n_classes) The predicted classes. """ check_is_fitted(self) @@ -980,10 +980,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -1041,12 +1041,12 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The input data. Returns ------- - log_y_prob : array-like, shape (n_samples, n_classes) + log_y_prob : ndarray of shape (n_samples, n_classes) The predicted log-probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. Equivalent to log(predict_proba(X)) @@ -1059,12 +1059,12 @@ def predict_proba(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_prob : array-like, shape (n_samples, n_classes) + y_prob : ndarray of shape (n_samples, n_classes) The predicted probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ @@ -1090,11 +1090,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -1109,7 +1109,7 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. @@ -1125,15 +1125,15 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter.
- batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -1151,55 +1151,55 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Only used when solver='sgd'. - learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary <warm_start>`. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : bool, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -1207,29 +1207,29 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): ``n_iter_no_change`` consecutive epochs.
Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'. Maximum number of function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of function calls. @@ -1321,12 +1321,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples, n_outputs) + y : ndarray of shape (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index efe3aeda951af..14960a8b2bb22 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -38,25 +38,25 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Parameters ---------- - n_components : int, optional + n_components : int, default=256 Number of binary hidden units. - learning_rate : float, optional + learning_rate : float, default=0.1 The learning rate for weight updates. It is *highly* recommended to tune this hyper-parameter. Reasonable values are in the 10**[0., -3.] range. - batch_size : int, optional + batch_size : int, default=10 Number of examples per minibatch. - n_iter : int, optional + n_iter : int, default=10 Number of iterations/sweeps over the training dataset to perform during training. - verbose : int, optional + verbose : int, default=0 The verbosity level. The default, zero, means silent mode. - random_state : integer or RandomState, optional + random_state : int or RandomState instance, default=None A random number generator instance to define the state of the random permutations generator. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. @@ -113,12 +113,12 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The data to be transformed. Returns ------- - h : array, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Latent representations of the data. """ check_is_fitted(self) @@ -131,12 +131,12 @@ def _mean_hiddens(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer.
Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Corresponding mean field values for the hidden layer. """ p = safe_sparse_dot(v, self.components_.T) @@ -148,7 +148,7 @@ def _sample_hiddens(self, v, rng): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to sample from. rng : RandomState @@ -156,7 +156,7 @@ def _sample_hiddens(self, v, rng): Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer. """ p = self._mean_hiddens(v) @@ -167,7 +167,7 @@ def _sample_visibles(self, h, rng): Parameters ---------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer to sample from. rng : RandomState @@ -175,7 +175,7 @@ def _sample_visibles(self, h, rng): Returns ------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. """ p = np.dot(h, self.components_) @@ -188,12 +188,12 @@ def _free_energy(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. Returns ------- - free_energy : array-like, shape (n_samples,) + free_energy : ndarray of shape (n_samples,) The value of the free energy. """ return (- safe_sparse_dot(v, self.intercept_visible_) @@ -205,12 +205,12 @@ def gibbs(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to start from. Returns ------- - v_new : array-like, shape (n_samples, n_features) + v_new : ndarray of shape (n_samples, n_features) Values of the visible layer after one Gibbs step. """ check_is_fitted(self) @@ -227,7 +227,7 @@ def partial_fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. Returns @@ -263,7 +263,7 @@ def _fit(self, v_pos, rng): Parameters ---------- - v_pos : array-like, shape (n_samples, n_features) + v_pos : ndarray of shape (n_samples, n_features) The data to use for training. rng : RandomState @@ -290,12 +290,12 @@ def score_samples(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Values of the visible layer. Must be all-boolean (not checked). Returns ------- - pseudo_likelihood : array-like, shape (n_samples,) + pseudo_likelihood : ndarray of shape (n_samples,) Value of the pseudo-likelihood (proxy for likelihood). Notes @@ -328,7 +328,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Returns diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 3e49e94de8bd1..02fc53a7aecc2 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -16,7 +16,7 @@ class BaseOptimizer: The concatenated list containing coefs_ and intercepts_ in MLP model. 
Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights @@ -80,11 +80,11 @@ class SGDOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - lr_schedule : {'constant', 'adaptive', 'invscaling'}, default 'constant' + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' Learning rate schedule for weight updates. -'constant', is a constant learning rate given by @@ -100,10 +100,10 @@ class SGDOptimizer(BaseOptimizer): tol, or fail to increase validation score by tol if 'early_stopping' is on, the current learning rate is divided by 5. - momentum : float, optional, default 0.9 + momentum : float, default=0.9 Value of momentum used, must be larger than or equal to 0 - nesterov : bool, optional, default True + nesterov : bool, default=True Whether to use nesterov's momentum or not. Use nesterov's if True Attributes @@ -192,19 +192,19 @@ class AdamOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector, should be in [0, 1) - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector, should be in [0, 1) - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability Attributes
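A minimal sketch (not part of the patch) exercising the Ridge behavior documented above: the default ``solver='auto'`` and ``alpha=1.0``, plus the ``store_cv_values``/``cv_values_`` pair on RidgeCV, which requires ``cv=None`` (generalized cross-validation). The synthetic data and the alpha grid are illustrative choices::

    import numpy as np
    from sklearn.linear_model import Ridge, RidgeCV

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([1.5, -2.0, 0.5]) + 0.1 * rng.randn(50)

    # solver='auto' picks a routine based on the data, per the docstring.
    reg = Ridge(alpha=1.0, solver='auto').fit(X, y)
    print(reg.coef_.shape)   # (n_features,) for a single target
    print(reg.intercept_)    # a float, since y is 1-D

    # With cv=None, RidgeCV uses generalized cross-validation, so
    # store_cv_values=True is allowed and fills cv_values_.
    reg_cv = RidgeCV(alphas=[0.1, 1.0, 10.0], store_cv_values=True).fit(X, y)
    print(reg_cv.alpha_, reg_cv.cv_values_.shape)  # best alpha, (50, 3)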
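A similar sketch for the MLPClassifier early-stopping knobs documented above (``early_stopping``, ``validation_fraction``, ``n_iter_no_change``); the dataset and layer width are illustrative::

    from sklearn.datasets import make_classification
    from sklearn.neural_network import MLPClassifier

    X, y = make_classification(n_samples=200, random_state=0)

    # With early_stopping=True, validation_fraction of the training data is
    # held out, and training stops once the validation score fails to improve
    # by tol for n_iter_no_change consecutive epochs ('sgd' or 'adam' only).
    clf = MLPClassifier(hidden_layer_sizes=(100,), solver='adam',
                        early_stopping=True, validation_fraction=0.1,
                        n_iter_no_change=10, max_iter=200,
                        random_state=0).fit(X, y)
    print(clf.n_iter_)                     # epochs actually run
    print(clf.predict_proba(X[:3]).shape)  # (3, n_classes)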
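A sketch for BernoulliRBM, whose documented defaults appear above (``n_components=256``, ``learning_rate=0.1``, ``batch_size=10``, ``n_iter=10``); a smaller ``n_components`` is used here to keep the run cheap, and the binary data is synthetic::

    import numpy as np
    from sklearn.neural_network import BernoulliRBM

    rng = np.random.RandomState(0)
    X = (rng.rand(100, 20) > 0.5).astype(float)   # binary visible units

    rbm = BernoulliRBM(n_components=16, learning_rate=0.1, batch_size=10,
                       n_iter=10, random_state=0).fit(X)
    H = rbm.transform(X)   # ndarray of shape (n_samples, n_components)
    V = rbm.gibbs(X)       # one Gibbs step, (n_samples, n_features)
    print(H.shape, V.shape, rbm.score_samples(X).shape)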
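The AdamOptimizer entry documents ``learning_rate_init``, ``beta_1``, ``beta_2`` and ``epsilon``. Since that class lives in the private module ``sklearn.neural_network._stochastic_optimizers``, here is a self-contained NumPy sketch of the textbook Adam update those parameters drive (sklearn's internal step scheduling may differ in detail); the quadratic toy loss is a stand-in::

    import numpy as np

    def adam_step(param, grad, m, v, t, learning_rate_init=0.1,
                  beta_1=0.9, beta_2=0.999, epsilon=1e-8):
        # Update biased first and second moment estimates.
        m = beta_1 * m + (1 - beta_1) * grad
        v = beta_2 * v + (1 - beta_2) * grad ** 2
        # Bias-correct, then take a scaled step.
        m_hat = m / (1 - beta_1 ** t)
        v_hat = v / (1 - beta_2 ** t)
        param = param - learning_rate_init * m_hat / (np.sqrt(v_hat) + epsilon)
        return param, m, v

    w = np.zeros(3)
    m = np.zeros_like(w)
    v = np.zeros_like(w)
    target = np.array([1.0, -1.0, 0.5])
    for t in range(1, 201):
        grad = 2 * (w - target)      # gradient of ||w - target||^2
        w, m, v = adam_step(w, grad, m, v, t)
    print(np.round(w, 3))            # moves toward [1., -1., 0.5]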