Add a fit_predict method for the GMM classes

Cory Lorenz · Cory Lorenz · commit ff4d8d26691b · 2015-04-15T14:44:40.000-04:00
With few EM iterations, the labels returned by fit_predict may differ
from those of fit(X).predict(X), because of the final maximization step
in the EM algorithm.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -21,6 +21,8 @@ Enhancements
 
    - :class:`dummy.DummyClassifier` now supports a prior fitting strategy.
      By `Arnaud Joly`_.
+   - Added a ``fit_predict`` method for :class:`mixture.GMM` and subclasses.
+     By `Cory Lorenz`_.
 
 Bug fixes
 .........
@@ -3413,3 +3415,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Eric Martin: http://ericmart.in
 
 .. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241
+
+.. _Cory Lorenz: https://github.com/clorenz7
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
@@ -480,7 +480,7 @@ def _set_weights(self):
                                                     + self.gamma_[i, 2])
         self.weights_ /= np.sum(self.weights_)
 
-    def fit(self, X, y=None):
+    def _fit(self, X, y=None):
         """Estimate model parameters with the variational
         algorithm.
 
@@ -595,9 +595,14 @@ def fit(self, X, y=None):
             # Maximization step
             self._do_mstep(X, z, self.params)
 
+        if self.n_iter == 0:
+            # No iteration ran, so no z was computed; return zeros so that
+            # callers still get an (n_samples, n_components) array.
+            z = np.zeros((X.shape[0], self.n_components))
+
         self._set_weights()
 
-        return self
+        return z
 
 
 class VBGMM(DPGMM):
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
@@ -411,22 +411,44 @@ def sample(self, n_samples=1, random_state=None):
                     num_comp_in_X, random_state=random_state).T
         return X
 
-    def fit(self, X, y=None):
-        """Estimate model parameters with the expectation-maximization
-        algorithm.
+    def fit_predict(self, X, y=None):
+        """Fit the model to X and return the component label of each sample.
+
+        Warning: with few EM iterations the labels may differ from
+        ``fit(X).predict(X)`` because of the final maximization step.
 
-        A initialization step is performed before entering the em
-        algorithm. If you want to avoid this step, set the keyword
-        argument init_params to the empty string '' when creating the
-        GMM object. Likewise, if you would like just to do an
-        initialization, set n_iter=0.
+        Parameters
+        ----------
+        X : array-like, shape = [n_samples, n_features]
+
+        Returns
+        -------
+        C : array, shape = (n_samples,)
+        """
+        return self._fit(X, y).argmax(axis=1)
+
+    def _fit(self, X, y=None, do_prediction=False):
+        """Estimate model parameters with the EM algorithm.
+
+        An initialization step is performed before entering the
+        expectation-maximization (EM) algorithm. If you want to avoid
+        this step, set the keyword argument init_params to the empty
+        string '' when creating the GMM object. Likewise, if you would
+        like just to do an initialization, set n_iter=0.
 
         Parameters
         ----------
         X : array_like, shape (n, n_features)
             List of n_features-dimensional data points.  Each row
             corresponds to a single data point.
+
+        Returns
+        -------
+        responsibilities : array, shape (n_samples, n_components)
+            Posterior probabilities of each mixture component for each
+            observation
         """
+
         # initialization step
         X = check_array(X, dtype=np.float64)
         if X.shape[0] < self.n_components:
@@ -501,10 +523,33 @@ def fit(self, X, y=None):
             self.covars_ = best_params['covars']
             self.means_ = best_params['means']
             self.weights_ = best_params['weights']
+        else:
+            # Initialization only: no responsibilities were computed, so
+            # return zeros with the documented (n_samples, n_components) shape.
+            responsibilities = np.zeros((X.shape[0], self.n_components))
+
+        return responsibilities
+
+    def fit(self, X, y=None):
+        """Estimate model parameters with the EM algorithm.
+
+        An initialization step is performed before entering the
+        expectation-maximization (EM) algorithm. To skip it, set
+        init_params='' when creating the GMM object; to perform
+        only the initialization, set n_iter=0.
+
+        Parameters
+        ----------
+        X : array_like, shape (n, n_features)
+            List of n_features-dimensional data points.  Each row
+            corresponds to a single data point.
+        """
+        # Discard the value returned by _fit; fit returns the estimator.
+        self._fit(X, y)
         return self
 
     def _do_mstep(self, X, responsibilities, params, min_covar=0):
-        """ Perform the Mstep of the EM algorithm and return the class weihgts.
+        """ Perform the Mstep of the EM algorithm and return the class weights
         """
         weights = responsibilities.sum(axis=0)
         weighted_X_sum = np.dot(responsibilities.T, X)
diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py
@@ -8,6 +8,7 @@
 from sklearn import mixture
 from sklearn.datasets.samples_generator import make_spd_matrix
 from sklearn.utils.testing import assert_greater
+from sklearn.metrics.cluster import adjusted_rand_score
 
 rng = np.random.RandomState(0)
 
@@ -327,6 +328,32 @@ def test_1d_1component():
         assert_array_almost_equal(g.bic(X), g_full_bic)
 
 
+def assert_fit_predict_correct(model, X):
+    """Check that fit_predict agrees with fit followed by predict."""
+    labels_two_step = model.fit(X).predict(X)
+    labels_one_step = model.fit_predict(X)
+    score = adjusted_rand_score(labels_two_step, labels_one_step)
+    assert score == 1.0
+
+
+def test_fit_predict():
+    """Check that fit_predict agrees with fit followed by predict."""
+    prng = np.random.RandomState(0)
+
+    # Two Gaussian blobs, the second one shifted by `offset`.
+    n_per_blob, n_features, n_components = 100, 2, 2
+    offset = np.array([[8, 8]])
+    blob_near = prng.randn(n_per_blob, n_features)
+    blob_far = prng.randn(n_per_blob, n_features) + offset
+    X = np.vstack((blob_near, blob_far))
+
+    # Every estimator is given the same RandomState instance.
+    options = dict(n_components=n_components, covariance_type='full',
+                   min_covar=1e-7, n_iter=5, random_state=prng)
+    for estimator_cls in (mixture.GMM, mixture.VBGMM, mixture.DPGMM):
+        assert_fit_predict_correct(estimator_cls(**options), X)
+
+
 def test_aic():
     # Test the aic and bic criteria
     n_samples, n_dim, n_components = 50, 3, 2