fix lbfgs rename #7503


Merged: 1 commit, Sep 27, 2016.

This patch corrects the misspelled solver name 'lbgfs' to 'lbfgs' in the documentation, the multilayer perceptron module, and its tests.

8 changes: 4 additions & 4 deletions doc/modules/neural_networks_supervised.rst
@@ -86,7 +86,7 @@ training samples::
>>> from sklearn.neural_network import MLPClassifier
>>> X = [[0., 0.], [1., 1.]]
>>> y = [0, 1]
- >>> clf = MLPClassifier(solver='lbgfs', alpha=1e-5,
+ >>> clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
... hidden_layer_sizes=(5, 2), random_state=1)
...
>>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
@@ -95,7 +95,7 @@ training samples::
epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
- solver='lbgfs', tol=0.0001, validation_fraction=0.1, verbose=False,
+ solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)

After fitting (training), the model can predict labels for new samples::
@@ -134,7 +134,7 @@ indices where the value is `1` represents the assigned classes of that sample::

>>> X = [[0., 0.], [1., 1.]]
>>> y = [[0, 1], [1, 1]]
- >>> clf = MLPClassifier(solver='lbgfs', alpha=1e-5,
+ >>> clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
... hidden_layer_sizes=(15,), random_state=1)
...
>>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
@@ -143,7 +143,7 @@ indices where the value is `1` represents the assigned classes of that sample::
epsilon=1e-08, hidden_layer_sizes=(15,), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
- solver='lbgfs', tol=0.0001, validation_fraction=0.1, verbose=False,
+ solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
>>> clf.predict([1., 2.])
array([[1, 1]])
26 changes: 13 additions & 13 deletions sklearn/neural_network/multilayer_perceptron.py
@@ -134,7 +134,7 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas,
with respect to the different parameters given in the initialization.

Returned gradients are packed in a single vector so it can be used
- in lbgfs
+ in lbfgs

Parameters
----------
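The docstring above refers to packing gradients into a single vector for L-BFGS: every weight matrix and bias vector is flattened and concatenated, because SciPy's L-BFGS interface optimizes over one flat 1-D array. A simplified stand-in for the library's private packing helper (names and details illustrative, not the exact code):

```python
import numpy as np

def pack(coefs, intercepts):
    # Flatten each weight matrix and bias vector, then concatenate,
    # so the optimizer sees a single 1-D parameter array.
    return np.hstack([arr.ravel() for arr in coefs + intercepts])

# Example: two layers of weights plus their biases become one vector.
coefs = [np.zeros((2, 5)), np.zeros((5, 1))]
intercepts = [np.zeros(5), np.zeros(1)]
print(pack(coefs, intercepts).shape)  # (21,)
```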
@@ -345,8 +345,8 @@ def _fit(self, X, y, incremental=False):
# First time training the model
self._initialize(y, layer_units)

- # lbgfs does not support mini-batches
- if self.solver == 'lbgfs':
+ # lbfgs does not support mini-batches
+ if self.solver == 'lbfgs':
batch_size = n_samples
elif self.batch_size == 'auto':
batch_size = min(200, n_samples)
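The hunk above encodes the batch-size policy; pulled out as a plain function it might look like the sketch below (the helper name is hypothetical, the logic mirrors the diff):

```python
def resolve_batch_size(solver, batch_size, n_samples):
    # L-BFGS is a full-batch method: the entire training set is
    # used as one batch regardless of the batch_size setting.
    if solver == 'lbfgs':
        return n_samples
    # Stochastic solvers ('sgd', 'adam') cap the automatic size at 200.
    if batch_size == 'auto':
        return min(200, n_samples)
    return batch_size

print(resolve_batch_size('lbfgs', 'auto', 1000))  # 1000
print(resolve_batch_size('adam', 'auto', 1000))   # 200
```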
@@ -375,7 +375,7 @@ def _fit(self, X, y, incremental=False):
intercept_grads, layer_units, incremental)

# Run the LBFGS solver
- elif self.solver == 'lbgfs':
+ elif self.solver == 'lbfgs':
self._fit_lbfgs(X, y, activations, deltas, coef_grads,
intercept_grads, layer_units)
return self
@@ -422,7 +422,7 @@ def _validate_hyperparameters(self):
if self.learning_rate not in ["constant", "invscaling", "adaptive"]:
raise ValueError("learning rate %s is not supported. " %
self.learning_rate)
- supported_solvers = _STOCHASTIC_SOLVERS + ["lbgfs"]
+ supported_solvers = _STOCHASTIC_SOLVERS + ["lbfgs"]
if self.solver not in supported_solvers:
raise ValueError("The solver %s is not supported. "
" Expected one of: %s" %
@@ -704,10 +704,10 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
- 'relu', the rectified linear unit function,
returns f(x) = max(0, x)

- solver : {'lbgfs', 'sgd', 'adam'}, default 'adam'
+ solver : {'lbfgs', 'sgd', 'adam'}, default 'adam'
The solver for weight optimization.

- - 'lbgfs' is an optimizer in the family of quasi-Newton methods.
+ - 'lbfgs' is an optimizer in the family of quasi-Newton methods.

- 'sgd' refers to stochastic gradient descent.

@@ -717,15 +717,15 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
Note: The default solver 'adam' works pretty well on relatively
large datasets (with thousands of training samples or more) in terms of
both training time and validation score.
- For small datasets, however, 'lbgfs' can converge faster and perform
+ For small datasets, however, 'lbfgs' can converge faster and perform
better.

alpha : float, optional, default 0.0001
L2 penalty (regularization term) parameter.

batch_size : int, optional, default 'auto'
Size of minibatches for stochastic optimizers.
- If the solver is 'lbgfs', the classifier will not use minibatch.
+ If the solver is 'lbfgs', the classifier will not use minibatch.
When set to "auto", `batch_size=min(200, n_samples)`

learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant'
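As the docstring note above says, 'lbfgs' tends to converge faster on small datasets; a minimal, self-contained illustration using the corrected spelling (data and sizes are arbitrary):

```python
from sklearn.neural_network import MLPClassifier

# A tiny XOR-style problem: full-batch L-BFGS handles such small
# datasets well, whereas 'sgd'/'adam' shine on larger ones.
X = [[0., 0.], [0., 1.], [1., 0.], [1., 1.]]
y = [0, 1, 1, 0]
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X, y)
print(clf.predict([[1., 0.]]))
```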
@@ -1046,10 +1046,10 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
- 'relu', the rectified linear unit function,
returns f(x) = max(0, x)

- solver : {'lbgfs', 'sgd', 'adam'}, default 'adam'
+ solver : {'lbfgs', 'sgd', 'adam'}, default 'adam'
The solver for weight optimization.

- - 'lbgfs' is an optimizer in the family of quasi-Newton methods.
+ - 'lbfgs' is an optimizer in the family of quasi-Newton methods.

- 'sgd' refers to stochastic gradient descent.

@@ -1059,15 +1059,15 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
Note: The default solver 'adam' works pretty well on relatively
large datasets (with thousands of training samples or more) in terms of
both training time and validation score.
- For small datasets, however, 'lbgfs' can converge faster and perform
+ For small datasets, however, 'lbfgs' can converge faster and perform
better.

alpha : float, optional, default 0.0001
L2 penalty (regularization term) parameter.

batch_size : int, optional, default 'auto'
Size of minibatches for stochastic optimizers.
- If the solver is 'lbgfs', the classifier will not use minibatch.
+ If the solver is 'lbfgs', the classifier will not use minibatch.
When set to "auto", `batch_size=min(200, n_samples)`

learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant'
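The same guidance holds for MLPRegressor; a symmetric sketch on synthetic data (all values arbitrary):

```python
import numpy as np
from sklearn.neural_network import MLPRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-1., 1., size=(100, 1))
y = np.sin(3 * X).ravel()  # a smooth 1-D target
reg = MLPRegressor(solver='lbfgs', alpha=1e-5,
                   hidden_layer_sizes=(20,), random_state=1)
reg.fit(X, y)
print(reg.score(X, y))  # R^2 on the training data
```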
18 changes: 9 additions & 9 deletions sklearn/neural_network/tests/test_mlp.py
@@ -176,7 +176,7 @@ def test_gradient():

for activation in ACTIVATION_TYPES:
mlp = MLPClassifier(activation=activation, hidden_layer_sizes=10,
- solver='lbgfs', alpha=1e-5,
+ solver='lbfgs', alpha=1e-5,
learning_rate_init=0.2, max_iter=1,
random_state=1)
mlp.fit(X, y)
@@ -235,7 +235,7 @@ def test_lbfgs_classification():
expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)

for activation in ACTIVATION_TYPES:
- mlp = MLPClassifier(solver='lbgfs', hidden_layer_sizes=50,
+ mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, shuffle=True, random_state=1,
activation=activation)
mlp.fit(X_train, y_train)
@@ -250,7 +250,7 @@ def test_lbfgs_regression():
X = Xboston
y = yboston
for activation in ACTIVATION_TYPES:
- mlp = MLPRegressor(solver='lbgfs', hidden_layer_sizes=50,
+ mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, shuffle=True, random_state=1,
activation=activation)
mlp.fit(X, y)
@@ -287,7 +287,7 @@ def test_multilabel_classification():
# test fit method
X, y = make_multilabel_classification(n_samples=50, random_state=0,
return_indicator=True)
- mlp = MLPClassifier(solver='lbgfs', hidden_layer_sizes=50, alpha=1e-5,
+ mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
max_iter=150, random_state=0, activation='logistic',
learning_rate_init=0.2)
mlp.fit(X, y)
@@ -305,7 +305,7 @@ def test_multioutput_regression():
def test_multioutput_regression():
# Test that multi-output regression works as expected
X, y = make_regression(n_samples=200, n_targets=5)
- mlp = MLPRegressor(solver='lbgfs', hidden_layer_sizes=50, max_iter=200,
+ mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50, max_iter=200,
random_state=1)
mlp.fit(X, y)
assert_greater(mlp.score(X, y), 0.9)
@@ -388,8 +388,8 @@ def test_partial_fit_errors():
assert_raises(ValueError,
MLPClassifier(solver='sgd').partial_fit, X, y, classes=[2])

- # lbgfs doesn't support partial_fit
- assert_false(hasattr(MLPClassifier(solver='lbgfs'), 'partial_fit'))
+ # lbfgs doesn't support partial_fit
+ assert_false(hasattr(MLPClassifier(solver='lbfgs'), 'partial_fit'))


def test_params_errors():
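The assertion in the hunk above relies on `partial_fit` being absent when the solver is L-BFGS. One common way to achieve that, and roughly the pattern sklearn used at the time, is a property that raises AttributeError for non-stochastic solvers; a simplified sketch, not the library's exact code:

```python
class Estimator:
    def __init__(self, solver='adam'):
        self.solver = solver

    @property
    def partial_fit(self):
        # Raising AttributeError from the property makes
        # hasattr(est, 'partial_fit') return False.
        if self.solver not in ('sgd', 'adam'):
            raise AttributeError("partial_fit is only available for "
                                 "stochastic solvers")
        return self._partial_fit

    def _partial_fit(self, X, y):
        pass  # incremental update would go here

print(hasattr(Estimator(solver='lbfgs'), 'partial_fit'))  # False
print(hasattr(Estimator(solver='sgd'), 'partial_fit'))    # True
```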
@@ -471,7 +471,7 @@ def test_predict_proba_multilabel():
return_indicator=True)
n_samples, n_classes = Y.shape

- clf = MLPClassifier(solver='lbgfs', hidden_layer_sizes=30,
+ clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
random_state=0)
clf.fit(X, Y)
y_proba = clf.predict_proba(X)
@@ -493,7 +493,7 @@ def test_sparse_matrices():
X = X_digits_binary[:50]
y = y_digits_binary[:50]
X_sparse = csr_matrix(X)
- mlp = MLPClassifier(solver='lbgfs', hidden_layer_sizes=15,
+ mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=15,
random_state=1)
mlp.fit(X, y)
pred1 = mlp.predict(X)