From ce3b3803857f13a84ef18648cc78585db0d08e52 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Wed, 31 Aug 2016 10:34:10 +0200 Subject: [PATCH] Add Dirichlet process prior to BayesianGaussianMixture --- doc/modules/mixture.rst | 277 +++++++++--------- doc/whats_new.rst | 14 +- .../mixture/plot_bayesian_gaussian_mixture.py | 114 ------- examples/mixture/plot_concentration_prior.py | 135 +++++++++ examples/mixture/plot_gmm.py | 31 +- examples/mixture/plot_gmm_sin.py | 114 +++++-- sklearn/mixture/base.py | 57 +++- sklearn/mixture/bayesian_mixture.py | 149 +++++++--- sklearn/mixture/dpgmm.py | 21 +- sklearn/mixture/gaussian_mixture.py | 16 +- sklearn/mixture/gmm.py | 10 + .../mixture/tests/test_bayesian_mixture.py | 230 +++++++++------ sklearn/mixture/tests/test_dpgmm.py | 27 +- .../mixture/tests/test_gaussian_mixture.py | 60 +++- sklearn/mixture/tests/test_gmm.py | 8 +- 15 files changed, 822 insertions(+), 441 deletions(-) delete mode 100644 examples/mixture/plot_bayesian_gaussian_mixture.py create mode 100644 examples/mixture/plot_concentration_prior.py diff --git a/doc/modules/mixture.rst b/doc/modules/mixture.rst index cf9c3ea7e7e5a..2449901e90cd2 100644 --- a/doc/modules/mixture.rst +++ b/doc/modules/mixture.rst @@ -8,7 +8,7 @@ Gaussian mixture models .. currentmodule:: sklearn.mixture -`sklearn.mixture` is a package which enables one to learn +``sklearn.mixture`` is a package which enables one to learn Gaussian Mixture Models (diagonal, spherical, tied and full covariance matrices supported), sample them, and estimate them from data. Facilities to help determine the appropriate number of @@ -93,14 +93,16 @@ Cons or information theoretical criteria to decide how many components to use in the absence of external cues. -Selecting the number of components in a classical GMM ------------------------------------------------------- +Selecting the number of components in a classical Gaussian Mixture Model +------------------------------------------------------------------------ -The BIC criterion can be used to select the number of components in a GMM -in an efficient way. In theory, it recovers the true number of components -only in the asymptotic regime (i.e. if much data is available). -Note that using a :ref:`DPGMM ` avoids the specification of the -number of components for a Gaussian mixture model. +The BIC criterion can be used to select the number of components in a Gaussian +Mixture in an efficient way. In theory, it recovers the true number of +components only in the asymptotic regime (i.e. if much data is available and +assuming that the data was actually generated i.i.d. from a mixture of Gaussian +distribution). Note that using a :ref:`Variational Bayesian Gaussian mixture ` +avoids the specification of the number of components for a Gaussian mixture +model. .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_selection_001.png :target: ../auto_examples/mixture/plot_gmm_selection.html @@ -135,11 +137,12 @@ to a local optimum. .. _bgmm: -Bayesian Gaussian Mixture -========================= +Variational Bayesian Gaussian Mixture +===================================== -The :class:`BayesianGaussianMixture` object implements a variant of the Gaussian -mixture model with variational inference algorithms. +The :class:`BayesianGaussianMixture` object implements a variant of the +Gaussian mixture model with variational inference algorithms. The API is +similar as the one defined by :class:`GaussianMixture`. .. 
_variational_inference: @@ -152,171 +155,167 @@ priors) instead of data likelihood. The principle behind variational methods is the same as expectation-maximization (that is both are iterative algorithms that alternate between finding the probabilities for each point to be generated by each mixture and -fitting the mixtures to these assigned points), but variational +fitting the mixture to these assigned points), but variational methods add regularization by integrating information from prior distributions. This avoids the singularities often found in expectation-maximization solutions but introduces some subtle biases to the model. Inference is often notably slower, but not usually as much so as to render usage unpractical. -Due to its Bayesian nature, the variational algorithm needs more -hyper-parameters than expectation-maximization, the most -important of these being the concentration parameter ``dirichlet_concentration_prior``. Specifying -a high value of prior of the dirichlet concentration leads more often to uniformly-sized mixture -components, while specifying small (between 0 and 1) values will lead -to some mixture components getting almost all the points while most -mixture components will be centered on just a few of the remaining -points. - -.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_bayesian_gaussian_mixture_001.png - :target: ../auto_examples/mixture/plot_bayesian_gaussian_mixture.html - :align: center - :scale: 50% - -.. topic:: Examples: - - * See :ref:`sphx_glr_auto_examples_plot_bayesian_gaussian_mixture.py` for a comparaison of - the results of the ``BayesianGaussianMixture`` for different values - of the parameter ``dirichlet_concentration_prior``. - -Pros and cons of variational inference with :class:BayesianGaussianMixture --------------------------------------------------------------------------- - -Pros -..... - -:Regularization: due to the incorporation of prior information, - variational solutions have less pathological special cases than - expectation-maximization solutions. - -:Automatic selection: when `dirichlet_concentration_prior` is small enough and - `n_components` is larger than what is found necessary by the model, the - Variational Bayesian mixture model has a natural tendency to set some mixture - weights values close to zero. This makes it possible to let the model choose a - suitable number of effective components automatically. - -Cons -..... - -:Bias: to regularize a model one has to add biases. The - variational algorithm will bias all the means towards the origin - (part of the prior information adds a "ghost point" in the origin - to every mixture component) and it will bias the covariances to - be more spherical. It will also, depending on the concentration - parameter, bias the cluster structure either towards uniformity - or towards a rich-get-richer scenario. - -:Hyperparameters: this algorithm needs an extra hyperparameter - that might need experimental tuning via cross-validation. +Due to its Bayesian nature, the variational algorithm needs more hyper- +parameters than expectation-maximization, the most important of these being the +concentration parameter ``weight_concentration_prior``. Specifying a low value +for the concentration prior will make the model put most of the weight on few +components set the remaining components weights very close to zero. High values +of the concentration prior will allow a larger number of components to be active +in the mixture. 
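+
+As a minimal, illustrative sketch (the toy data and parameter values below are
+arbitrary choices, not part of the shipped examples), the effect of
+``weight_concentration_prior`` can be checked by fitting the same data with a
+low and a high value and inspecting the resulting ``weights_``::
+
+    import numpy as np
+    from sklearn.mixture import BayesianGaussianMixture
+
+    rng = np.random.RandomState(0)
+    # Two well-separated blobs, but the model is given 5 components.
+    X = np.vstack([rng.randn(100, 2) - 5., rng.randn(100, 2) + 5.])
+
+    for prior in (1e-3, 1e3):
+        bgmm = BayesianGaussianMixture(
+            n_components=5, weight_concentration_prior=prior,
+            weight_concentration_prior_type='dirichlet_process',
+            random_state=0).fit(X)
+        # A low concentration prior typically drives most of the extra
+        # component weights towards zero; a high one keeps more components
+        # active with more evenly spread weights.
+        print(prior, bgmm.weights_.round(3))
+
+The exact weights depend on the data and on the initialization; only the
+qualitative trend is of interest here.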
+
+The :class:`BayesianGaussianMixture` class proposes two types of prior for the
+weights distribution: a finite mixture model with a Dirichlet distribution and
+an infinite mixture model with the Dirichlet Process. In practice, the
+Dirichlet Process inference algorithm is approximated and uses a truncated
+distribution with a fixed maximum number of components (called the
+Stick-breaking representation). The number of components actually used almost
+always depends on the data.
+
+The next figure compares the results obtained for the different types of
+weight concentration prior (parameter ``weight_concentration_prior_type``)
+for different values of ``weight_concentration_prior``.
+Here, we can see that the value of the ``weight_concentration_prior`` parameter
+has a strong impact on the effective number of active components obtained. We
+can also notice that large values of the weight concentration prior lead to
+more uniform weights when the type of prior is 'dirichlet_distribution' while
+this is not necessarily the case for the 'dirichlet_process' type (used by
+default).
+
+.. |plot_bgmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_concentration_prior_001.png
+   :target: ../auto_examples/mixture/plot_concentration_prior.html
+   :scale: 48%

-.. _dpgmm:

+.. |plot_dpgmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_concentration_prior_002.png
+   :target: ../auto_examples/mixture/plot_concentration_prior.html
+   :scale: 48%

-DPGMM: Infinite Gaussian mixtures
-=================================

+.. centered:: |plot_bgmm| |plot_dpgmm|
+
+The examples below compare Gaussian mixture models with a fixed number of
+components to the variational Gaussian mixture models with a Dirichlet process
+prior. Here, a classical Gaussian mixture is fitted with 5 components on a
+dataset composed of 2 clusters. We can see that the variational Gaussian
+mixture with a Dirichlet process prior is able to limit itself to only 2
+components whereas the Gaussian mixture fits the data with a fixed number of
+components that has to be set a priori by the user. In this case the user has
+selected ``n_components=5`` which does not match the true generative
+distribution of this toy dataset. Note that with very few observations, the
+variational Gaussian mixture models with a Dirichlet process prior can take a
+conservative stand and fit only one component.
+
+.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_001.png
+   :target: ../auto_examples/mixture/plot_gmm.html
+   :align: center
+   :scale: 70%

-The :class:`DPGMM` object implements a variant of the Gaussian mixture
-model with a variable (but bounded) number of components using the
-Dirichlet Process.
-This class doesn't require the user to choose the number of
-components, and at the expense of extra computational time the user
-only needs to specify a loose upper bound on this number and a
-concentration parameter.

-.. |plot_gmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_001.png
-   :target: ../auto_examples/mixture/plot_gmm.html
-   :scale: 48%

+On the following figure we are fitting a dataset not well depicted by a
+Gaussian mixture. Adjusting the ``weight_concentration_prior`` parameter of
+the :class:`BayesianGaussianMixture` controls the number of components used to
+fit this data. We also present on the last two plots a random sampling
+generated from the two resulting mixtures.

-.. |plot_gmm_sin| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_sin_001.png
+.. 
figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_sin_001.png :target: ../auto_examples/mixture/plot_gmm_sin.html - :scale: 48% - -.. centered:: |plot_gmm| |plot_gmm_sin| + :align: center + :scale: 65% -The examples above compare Gaussian mixture models with fixed number of -components, to DPGMM models. **On the left** the GMM is fitted with 5 -components on a dataset composed of 2 clusters. We can see that the DPGMM is -able to limit itself to only 2 components whereas the GMM fits the data fit too -many components. Note that with very little observations, the DPGMM can take a -conservative stand, and fit only one component. **On the right** we are fitting -a dataset not well-depicted by a Gaussian mixture. Adjusting the `alpha` -parameter of the DPGMM controls the number of components used to fit this -data. .. topic:: Examples: - * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on plotting the - confidence ellipsoids for both :class:`GaussianMixture` - and :class:`DPGMM`. + * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on + plotting the confidence ellipsoids for both :class:`GaussianMixture` + and :class:`BayesianGaussianMixture`. * :ref:`sphx_glr_auto_examples_mixture_plot_gmm_sin.py` shows using - :class:`GaussianMixture` and :class:`DPGMM` to fit a sine wave + :class:`GaussianMixture` and :class:`BayesianGaussianMixture` to fit a + sine wave. + + * See :ref:`sphx_glr_auto_examples_mixture_plot_concentration_prior.py` + for an example plotting the confidence ellipsoids for the + :class:`BayesianGaussianMixture` with different + ``weight_concentration_prior_type`` for different values of the parameter + ``weight_concentration_prior``. -Pros and cons of class :class:`DPGMM`: Dirichlet process mixture model ----------------------------------------------------------------------- + +Pros and cons of variational inference with :class:`BayesianGaussianMixture` +---------------------------------------------------------------------------- Pros ..... -:Less sensitivity to the number of parameters: unlike finite - models, which will almost always use all components as much as - they can, and hence will produce wildly different solutions for - different numbers of components, the Dirichlet process solution - won't change much with changes to the parameters, leading to more - stability and less tuning. +:Automatic selection: when ``weight_concentration_prior`` is small enough and + ``n_components`` is larger than what is found necessary by the model, the + Variational Bayesian mixture model has a natural tendency to set some mixture + weights values close to zero. This makes it possible to let the model choose + a suitable number of effective components automatically. Only an upper bound + of this number needs to be provided. Note however that the "ideal" number of + active components is very application specific and is typically ill-defined + in a data exploration setting. + +:Less sensitivity to the number of parameters: unlike finite models, which will + almost always use all components as much as they can, and hence will produce + wildly different solutions for different numbers of components, the + variantional inference with a Dirichlet process prior + (``weight_concentration_prior_type='dirichlet_process'``) won't change much + with changes to the parameters, leading to more stability and less tuning. 
+ +:Regularization: due to the incorporation of prior information, + variational solutions have less pathological special cases than + expectation-maximization solutions. -:No need to specify the number of components: only an upper bound of - this number needs to be provided. Note however that the DPMM is not - a formal model selection procedure, and thus provides no guarantee - on the result. Cons ..... -:Speed: the extra parametrization necessary for variational - inference and for the structure of the Dirichlet process can and - will make inference slower, although not by much. +:Speed: the extra parametrization necessary for variational inference make + inference slower, although not by much. + +:Hyperparameters: this algorithm needs an extra hyperparameter + that might need experimental tuning via cross-validation. -:Bias: as in variational techniques, but only more so, there are - many implicit biases in the Dirichlet process and the inference - algorithms, and whenever there is a mismatch between these biases - and the data it might be possible to fit better models using a +:Bias: there are many implicit biases in the inference algorithms (and also in + the Dirichlet process if used), and whenever there is a mismatch between + these biases and the data it might be possible to fit better models using a finite mixture. + .. _dirichlet_process: The Dirichlet Process --------------------- Here we describe variational inference algorithms on Dirichlet process -mixtures. The Dirichlet process is a prior probability distribution on +mixture. The Dirichlet process is a prior probability distribution on *clusterings with an infinite, unbounded, number of partitions*. Variational techniques let us incorporate this prior structure on Gaussian mixture models at almost no penalty in inference time, comparing with a finite Gaussian mixture model. -An important question is how can the Dirichlet process use an -infinite, unbounded number of clusters and still be consistent. While -a full explanation doesn't fit this manual, one can think of its -`chinese restaurant process -`_ -analogy to help understanding it. The -chinese restaurant process is a generative story for the Dirichlet -process. Imagine a chinese restaurant with an infinite number of -tables, at first all empty. When the first customer of the day -arrives, he sits at the first table. Every following customer will -then either sit on an occupied table with probability proportional to -the number of customers in that table or sit in an entirely new table -with probability proportional to the concentration parameter -`alpha`. After a finite number of customers has sat, it is easy to see -that only finitely many of the infinite tables will ever be used, and -the higher the value of alpha the more total tables will be used. So -the Dirichlet process does clustering with an unbounded number of -mixture components by assuming a very asymmetrical prior structure -over the assignments of points to components that is very concentrated -(this property is known as rich-get-richer, as the full tables in the -Chinese restaurant process only tend to get fuller as the simulation -progresses). +An important question is how can the Dirichlet process use an infinite, +unbounded number of clusters and still be consistent. While a full explanation +doesn't fit this manual, one can think of its `stick breaking process +` +analogy to help understanding it. The stick breaking process is a generative +story for the Dirichlet process. 
We start with a unit-length stick and in each +step we break off a portion of the remaining stick. Each time, we associate the +length of the piece of the stick to the proportion of points that falls into a +group of the mixture. At the end, to represent the infinite mixture, we +associate the last remaining piece of the stick to the proportion of points +that don't fall into all the other groups. The length of each piece is random +variable with probability proportional to the concentration parameter. Smaller +value of the concentration will divide the unit-length into larger pieces of +the stick (defining more concentrated distribution). Larger concentration +values will create smaller pieces of the stick (increasing the number of +components with non zero weights). Variational inference techniques for the Dirichlet process still work with a finite approximation to this infinite mixture model, but @@ -325,13 +324,3 @@ use, one just specifies the concentration parameter and an upper bound on the number of mixture components (this upper bound, assuming it is higher than the "true" number of components, affects only algorithmic complexity, not the actual number of components used). - -.. topic:: Derivation: - - * See `here `_ the full derivation of this - algorithm. - -.. toctree:: - :hidden: - - dp-derivation.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index a450c175ae8fc..5a3e969710a8b 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -419,10 +419,20 @@ API changes summary :class:`isotonic.IsotonicRegression`. By `Jonathan Arfa`_. - The old :class:`VBGMM` is deprecated in favor of the new - :class:`BayesianGaussianMixture`. The new class solves the computational + :class:`BayesianGaussianMixture` (with the parameter + ``weight_concentration_prior_type='dirichlet_distribution'``). + The new class solves the computational + problems of the old class and computes the Gaussian mixture with a + Dirichlet process prior faster than before. + (`#7295 `_) by + `Wei Xue`_ and `Thierry Guillemot`_. + + - The old :class:`VBGMM` is deprecated in favor of the new + :class:`BayesianGaussianMixture` (with the parameter + ``weight_concentration_prior_type='dirichlet_distribution'``). + The new class solves the computational problems of the old class and computes the Variational Bayesian Gaussian mixture faster than before. - Ref :ref:`b` for more information. (`#6651 `_) by `Wei Xue`_ and `Thierry Guillemot`_. diff --git a/examples/mixture/plot_bayesian_gaussian_mixture.py b/examples/mixture/plot_bayesian_gaussian_mixture.py deleted file mode 100644 index 9efea3f04c7fe..0000000000000 --- a/examples/mixture/plot_bayesian_gaussian_mixture.py +++ /dev/null @@ -1,114 +0,0 @@ -""" -====================================================== -Bayesian Gaussian Mixture Concentration Prior Analysis -====================================================== - -Plot the resulting ellipsoids of a mixture of three Gaussians with -variational Bayesian Gaussian Mixture for three different values on the -prior the dirichlet concentration. - -For all models, the Variationnal Bayesian Gaussian Mixture adapts its number of -mixture automatically. The parameter `dirichlet_concentration_prior` has a -direct link with the resulting number of components. 
Specifying a high value of -`dirichlet_concentration_prior` leads more often to uniformly-sized mixture -components, while specifying small (under 0.1) values will lead to some mixture -components getting almost all the points while most mixture components will be -centered on just a few of the remaining points. -""" -# Author: Thierry Guillemot -# License: BSD 3 clause - -import numpy as np -import matplotlib as mpl -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -from sklearn.mixture import BayesianGaussianMixture - -print(__doc__) - - -def plot_ellipses(ax, weights, means, covars): - for n in range(means.shape[0]): - v, w = np.linalg.eigh(covars[n][:2, :2]) - u = w[0] / np.linalg.norm(w[0]) - angle = np.arctan2(u[1], u[0]) - angle = 180 * angle / np.pi # convert to degrees - v = 2 * np.sqrt(2) * np.sqrt(v) - ell = mpl.patches.Ellipse(means[n, :2], v[0], v[1], 180 + angle) - ell.set_clip_box(ax.bbox) - ell.set_alpha(weights[n]) - ax.add_artist(ell) - - -def plot_results(ax1, ax2, estimator, dirichlet_concentration_prior, X, y, plot_title=False): - estimator.dirichlet_concentration_prior = dirichlet_concentration_prior - estimator.fit(X) - ax1.set_title("Bayesian Gaussian Mixture for " - r"$dc_0=%.1e$" % dirichlet_concentration_prior) - # ax1.axis('equal') - ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8) - ax1.set_xlim(-2., 2.) - ax1.set_ylim(-3., 3.) - ax1.set_xticks(()) - ax1.set_yticks(()) - plot_ellipses(ax1, estimator.weights_, estimator.means_, - estimator.covariances_) - - ax2.get_xaxis().set_tick_params(direction='out') - ax2.yaxis.grid(True, alpha=0.7) - for k, w in enumerate(estimator.weights_): - ax2.bar(k - .45, w, width=0.9, color='royalblue', zorder=3) - ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.), - horizontalalignment='center') - ax2.set_xlim(-.6, 2 * n_components - .4) - ax2.set_ylim(0., 1.1) - ax2.tick_params(axis='y', which='both', left='off', - right='off', labelleft='off') - ax2.tick_params(axis='x', which='both', top='off') - - if plot_title: - ax1.set_ylabel('Estimated Mixtures') - ax2.set_ylabel('Weight of each component') - -# Parameters -random_state = 2 -n_components, n_features = 3, 2 -colors = np.array(['mediumseagreen', 'royalblue', 'r', 'gold', - 'orchid', 'indigo', 'darkcyan', 'tomato']) -dirichlet_concentration_prior = np.logspace(-3, 3, 3) -covars = np.array([[[.7, .0], [.0, .1]], - [[.5, .0], [.0, .1]], - [[.5, .0], [.0, .1]]]) -samples = np.array([200, 500, 200]) -means = np.array([[.0, -.70], - [.0, .0], - [.0, .70]]) - - -# Here we put beta_prior to 0.8 to minimize the influence of the prior for this -# dataset -estimator = BayesianGaussianMixture(n_components=2 * n_components, - init_params='random', max_iter=1500, - mean_precision_prior=.8, tol=1e-9, - random_state=random_state) - -# Generate data -rng = np.random.RandomState(random_state) -X = np.vstack([ - rng.multivariate_normal(means[j], covars[j], samples[j]) - for j in range(n_components)]) -y = np.concatenate([j * np.ones(samples[j], dtype=int) - for j in range(n_components)]) - -# Plot Results -plt.figure(figsize=(4.7 * 3, 8)) -plt.subplots_adjust(bottom=.04, top=0.95, hspace=.05, wspace=.05, - left=.03, right=.97) - -gs = gridspec.GridSpec(3, len(dirichlet_concentration_prior)) -for k, dc in enumerate(dirichlet_concentration_prior): - plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]), - estimator, dc, X, y, plot_title=k == 0) - -plt.show() diff --git a/examples/mixture/plot_concentration_prior.py 
b/examples/mixture/plot_concentration_prior.py
new file mode 100644
index 0000000000000..49d9e6211b351
--- /dev/null
+++ b/examples/mixture/plot_concentration_prior.py
@@ -0,0 +1,135 @@
+"""
+===========================================================================
+Concentration Prior Type Analysis of Variational Bayesian Gaussian Mixture
+===========================================================================
+
+This example plots the ellipsoids obtained from a toy dataset (mixture of three
+Gaussians) fitted by ``BayesianGaussianMixture`` class models with a
+Dirichlet distribution prior
+(``weight_concentration_prior_type='dirichlet_distribution'``) and a Dirichlet
+process prior (``weight_concentration_prior_type='dirichlet_process'``). On
+each figure, we plot the results for three different values of the weight
+concentration prior.
+
+The ``BayesianGaussianMixture`` class can adapt its number of mixture
+components automatically. The parameter ``weight_concentration_prior`` has a
+direct link with the resulting number of components with non-zero weights.
+Specifying a low value for the concentration prior will make the model put most
+of the weight on a few components and set the remaining components' weights
+very close to zero. High values of the concentration prior will allow a larger
+number of components to be active in the mixture.
+
+The Dirichlet process prior allows one to define an infinite number of
+components and automatically selects the correct number of components: it
+activates a component only if it is necessary.
+
+On the contrary, the classical finite mixture model with a Dirichlet
+distribution prior will favor more uniformly weighted components and therefore
+tends to divide natural clusters into unnecessary sub-components.
+"""
+# Author: Thierry Guillemot
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+
+from sklearn.mixture import BayesianGaussianMixture
+
+print(__doc__)
+
+
+def plot_ellipses(ax, weights, means, covars):
+    for n in range(means.shape[0]):
+        eig_vals, eig_vecs = np.linalg.eigh(covars[n])
+        unit_eig_vec = eig_vecs[0] / np.linalg.norm(eig_vecs[0])
+        angle = np.arctan2(unit_eig_vec[1], unit_eig_vec[0])
+        # Ellipse needs degrees
+        angle = 180 * angle / np.pi
+        # eigenvector normalization
+        eig_vals = 2 * np.sqrt(2) * np.sqrt(eig_vals)
+        ell = mpl.patches.Ellipse(means[n], eig_vals[0], eig_vals[1],
+                                  180 + angle)
+        ell.set_clip_box(ax.bbox)
+        ell.set_alpha(weights[n])
+        ell.set_facecolor('#56B4E9')
+        ax.add_artist(ell)
+
+
+def plot_results(ax1, ax2, estimator, X, y, title, plot_title=False):
+    ax1.set_title(title)
+    ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8)
+    ax1.set_xlim(-2., 2.)
+    ax1.set_ylim(-3., 3.)
+ ax1.set_xticks(()) + ax1.set_yticks(()) + plot_ellipses(ax1, estimator.weights_, estimator.means_, + estimator.covariances_) + + ax2.get_xaxis().set_tick_params(direction='out') + ax2.yaxis.grid(True, alpha=0.7) + for k, w in enumerate(estimator.weights_): + ax2.bar(k - .45, w, width=0.9, color='#56B4E9', zorder=3) + ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.), + horizontalalignment='center') + ax2.set_xlim(-.6, 2 * n_components - .4) + ax2.set_ylim(0., 1.1) + ax2.tick_params(axis='y', which='both', left='off', + right='off', labelleft='off') + ax2.tick_params(axis='x', which='both', top='off') + + if plot_title: + ax1.set_ylabel('Estimated Mixtures') + ax2.set_ylabel('Weight of each component') + +# Parameters of the dataset +random_state, n_components, n_features = 2, 3, 2 +colors = np.array(['#0072B2', '#F0E442', '#D55E00']) + +covars = np.array([[[.7, .0], [.0, .1]], + [[.5, .0], [.0, .1]], + [[.5, .0], [.0, .1]]]) +samples = np.array([200, 500, 200]) +means = np.array([[.0, -.70], + [.0, .0], + [.0, .70]]) + +# mean_precision_prior= 0.8 to minimize the influence of the prior +estimators = [ + ("Finite mixture with a Dirichlet distribution\nprior and " + r"$\gamma_0=$", BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_distribution", + n_components=2 * n_components, reg_covar=0, init_params='random', + max_iter=1500, mean_precision_prior=.8, + random_state=random_state), [0.001, 1, 1000]), + ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", + BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_process", + n_components=2 * n_components, reg_covar=0, init_params='random', + max_iter=1500, mean_precision_prior=.8, + random_state=random_state), [1, 1000, 100000])] + +# Generate data +rng = np.random.RandomState(random_state) +X = np.vstack([ + rng.multivariate_normal(means[j], covars[j], samples[j]) + for j in range(n_components)]) +y = np.concatenate([j * np.ones(samples[j], dtype=int) + for j in range(n_components)]) + +# Plot results in two different figures +for (title, estimator, concentrations_prior) in estimators: + plt.figure(figsize=(4.7 * 3, 8)) + plt.subplots_adjust(bottom=.04, top=0.90, hspace=.05, wspace=.05, + left=.03, right=.99) + + gs = gridspec.GridSpec(3, len(concentrations_prior)) + for k, concentration in enumerate(concentrations_prior): + estimator.weight_concentration_prior = concentration + estimator.fit(X) + plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]), estimator, + X, y, r"%s$%.1e$" % (title, concentration), + plot_title=k == 0) + +plt.show() diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py index d6b0839c193a7..5f2f8596d4bbe 100644 --- a/examples/mixture/plot_gmm.py +++ b/examples/mixture/plot_gmm.py @@ -3,16 +3,18 @@ Gaussian Mixture Model Ellipsoids ================================= -Plot the confidence ellipsoids of a mixture of two Gaussians with EM -and variational Dirichlet process. - -Both models have access to five components with which to fit the -data. Note that the EM model will necessarily use all five components -while the DP model will effectively only use as many as are needed for -a good fit. This is a property of the Dirichlet Process prior. Here we -can see that the EM model splits some components arbitrarily, because it -is trying to fit too many components, while the Dirichlet Process model -adapts it number of state automatically. 
+Plot the confidence ellipsoids of a mixture of two Gaussians +obtained with Expectation Maximisation (``GaussianMixture`` class) and +Variational Inference (``BayesianGaussianMixture`` class models with +a Dirichlet process prior). + +Both models have access to five components with which to fit the data. Note +that the Expectation Maximisation model will necessarily use all five +components while the Variational Inference model will effectively only use as +many as are needed for a good fit. Here we can see that the Expectation +Maximisation model splits some components arbitrarily, because it is trying to +fit too many components, while the Dirichlet Process model adapts it number of +state automatically. This example doesn't show it, as we're in a low-dimensional space, but another advantage of the Dirichlet process model is that it can fit @@ -56,7 +58,7 @@ def plot_results(X, Y_, means, covariances, index, title): ell.set_alpha(0.5) splot.add_artist(ell) - plt.xlim(-10., 10.) + plt.xlim(-9., 5.) plt.ylim(-3., 6.) plt.xticks(()) plt.yticks(()) @@ -78,8 +80,9 @@ def plot_results(X, Y_, means, covariances, index, title): 'Gaussian Mixture') # Fit a Dirichlet process Gaussian mixture using five components -dpgmm = mixture.DPGMM(n_components=5, covariance_type='full').fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 1, - 'Dirichlet Process GMM') +dpgmm = mixture.BayesianGaussianMixture(n_components=5, + covariance_type='full').fit(X) +plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, + 'Bayesian Gaussian Mixture with a Dirichlet process prior') plt.show() diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py index cc9217740a195..f5fb2ded45120 100644 --- a/examples/mixture/plot_gmm_sin.py +++ b/examples/mixture/plot_gmm_sin.py @@ -3,15 +3,40 @@ Gaussian Mixture Model Sine Curve ================================= -This example highlights the advantages of the Dirichlet Process: -complexity control and dealing with sparse data. The dataset is formed -by 100 points loosely spaced following a noisy sine curve. The fit by -the GMM class, using the expectation-maximization algorithm to fit a -mixture of 10 Gaussian components, finds too-small components and very -little structure. The fits by the Dirichlet process, however, show -that the model can either learn a global structure for the data (small -alpha) or easily interpolate to finding relevant local structure -(large alpha), never falling into the problems shown by the GMM class. +This example demonstrates the behavior of Gaussian mixture models fit on data +that was not sampled from a mixture of Gaussian random variables. The dataset +is formed by 100 points loosely spaced following a noisy sine curve. There is +therefore no ground truth value for the number of Gaussian components. + +The first model is a classical Gaussian Mixture Model with 10 components fit +with the Expectation-Maximization algorithm. + +The second model is a Bayesian Gaussian Mixture Model with a Dirichlet process +prior fit with variational inference. The low value of the concentration prior +makes the model favor a lower number of active components. This models +"decides" to focus its modeling power on the big picture of the structure of +the dataset: groups of points with alternating directions modeled by +non-diagonal covariance matrices. Those alternating directions roughly capture +the alternating nature of the original sine signal. 
+ +The third model is also a Bayesian Gaussian mixture model with a Dirichlet +process prior but this time the value of the concentration prior is higher +giving the model more liberty to model the fine-grained structure of the data. +The result is a mixture with a larger number of active components that is +similar to the first model where we arbitrarily decided to fix the number of +components to 10. + +Which model is the best is a matter of subjective judgement: do we want to +favor models that only capture the big picture to summarize and explain most of +the structure of the data while ignoring the details or do we prefer models +that closely follow the high density regions of the signal? + +The last two panels show how we can sample from the last two models. The +resulting samples distributions do not look exactly like the original data +distribution. The difference primarily stems from the approximation error we +made by using a model that assumes that the data was generated by a finite +number of Gaussian components instead of a continuous noisy sine curve. + """ import itertools @@ -23,12 +48,14 @@ from sklearn import mixture +print(__doc__) + color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange']) -def plot_results(X, Y_, means, covariances, index, title): - splot = plt.subplot(3, 1, 1 + index) +def plot_results(X, Y, means, covariances, index, title): + splot = plt.subplot(5, 1, 1 + index) for i, (mean, covar, color) in enumerate(zip( means, covariances, color_iter)): v, w = linalg.eigh(covar) @@ -37,9 +64,9 @@ def plot_results(X, Y_, means, covariances, index, title): # as the DP will not use every component it has access to # unless it needs it, we shouldn't plot the redundant # components. - if not np.any(Y_ == i): + if not np.any(Y == i): continue - plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) + plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color) # Plot an ellipse to show the Gaussian component angle = np.arctan(u[1] / u[0]) @@ -56,7 +83,24 @@ def plot_results(X, Y_, means, covariances, index, title): plt.yticks(()) -# Number of samples per component +def plot_samples(X, Y, n_components, index, title): + plt.subplot(5, 1, 4 + index) + for i, color in zip(range(n_components), color_iter): + # as the DP will not use every component it has access to + # unless it needs it, we shouldn't plot the redundant + # components. + if not np.any(Y == i): + continue + plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color) + + plt.xlim(-6., 4. * np.pi - 6.) + plt.ylim(-5., 5.) + plt.title(title) + plt.xticks(()) + plt.yticks(()) + + +# Parameters n_samples = 100 # Generate random sample following a sine curve @@ -69,22 +113,42 @@ def plot_results(X, Y_, means, covariances, index, title): X[i, 0] = x + np.random.normal(0, 0.1) X[i, 1] = 3. 
* (np.sin(x) + np.random.normal(0, .2)) +plt.figure(figsize=(10, 10)) +plt.subplots_adjust(bottom=.04, top=0.95, hspace=.2, wspace=.05, + left=.03, right=.97) + # Fit a Gaussian mixture with EM using ten components gmm = mixture.GaussianMixture(n_components=10, covariance_type='full', max_iter=100).fit(X) plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 'Expectation-maximization') -# Fit a Dirichlet process Gaussian mixture using ten components -dpgmm = mixture.DPGMM(n_components=10, covariance_type='full', alpha=0.01, - n_iter=100).fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 1, - 'Dirichlet Process,alpha=0.01') - +dpgmm = mixture.BayesianGaussianMixture( + n_components=10, covariance_type='full', weight_concentration_prior=1e-2, + weight_concentration_prior_type='dirichlet_process', + mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2), + init_params="random", max_iter=100, random_state=2).fit(X) +plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, + "Bayesian Gaussian mixture models with a Dirichlet process prior " + r"for $\gamma_0=0.01$.") + +X_s, y_s = dpgmm.sample(n_samples=2000) +plot_samples(X_s, y_s, dpgmm.n_components, 0, + "Gaussian mixture with a Dirichlet process prior " + r"for $\gamma_0=0.01$ sampled with $2000$ samples.") + +dpgmm = mixture.BayesianGaussianMixture( + n_components=10, covariance_type='full', weight_concentration_prior=1e+2, + weight_concentration_prior_type='dirichlet_process', + mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2), + init_params="kmeans", max_iter=100, random_state=2).fit(X) +plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 2, + "Bayesian Gaussian mixture models with a Dirichlet process prior " + r"for $\gamma_0=100$") + +X_s, y_s = dpgmm.sample(n_samples=2000) +plot_samples(X_s, y_s, dpgmm.n_components, 1, + "Gaussian mixture with a Dirichlet process prior " + r"for $\gamma_0=100$ sampled with $2000$ samples.") -# Fit a Dirichlet process Gaussian mixture using ten components -dpgmm = mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100., - n_iter=100).fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 2, - 'Dirichlet Process,alpha=100.') plt.show() diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index 062d403228859..38ee12e925694 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -130,15 +130,17 @@ def _check_parameters(self, X): """ pass - def _initialize_parameters(self, X): + def _initialize_parameters(self, X, random_state): """Initialize the model parameters. Parameters ---------- X : array-like, shape (n_samples, n_features) + + random_state: RandomState + A random number generator instance. """ n_samples, _ = X.shape - random_state = check_random_state(self.random_state) if self.init_params == 'kmeans': resp = np.zeros((n_samples, self.n_components)) @@ -195,12 +197,14 @@ def fit(self, X, y=None): max_lower_bound = -np.infty self.converged_ = False + random_state = check_random_state(self.random_state) + n_samples, _ = X.shape for init in range(n_init): self._print_verbose_msg_init_beg(init) if do_init: - self._initialize_parameters(X) + self._initialize_parameters(X, random_state) self.lower_bound_ = -np.infty for n_iter in range(self.max_iter): @@ -228,7 +232,7 @@ def fit(self, X, y=None): if not self.converged_: warnings.warn('Initialization %d did not converged. 
' 'Try different init parameters, ' - 'or increase n_init, tol ' + 'or increase max_iter, tol ' 'or check for degenerate data.' % (init + 1), ConvergenceWarning) @@ -355,6 +359,51 @@ def predict_proba(self, X): _, log_resp = self._estimate_log_prob_resp(X) return np.exp(log_resp) + def sample(self, n_samples=1): + """Generate random samples from the fitted Gaussian distribution. + + Parameters + ---------- + n_samples : int, optional + Number of samples to generate. Defaults to 1. + + Returns + ------- + X : array, shape (n_samples, n_features) + Randomly generated sample + """ + self._check_is_fitted() + + if n_samples < 1: + raise ValueError( + "Invalid value for 'n_samples': %d . The sampling requires at " + "least one sample." % (self.n_components)) + + _, n_features = self.means_.shape + rng = check_random_state(self.random_state) + n_samples_comp = np.round(self.weights_ * n_samples).astype(int) + + if self.covariance_type == 'full': + X = np.vstack([ + rng.multivariate_normal(mean, covariance, int(sample)) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp)]) + elif self.covariance_type == "tied": + X = np.vstack([ + rng.multivariate_normal(mean, self.covariances_, int(sample)) + for (mean, sample) in zip( + self.means_, n_samples_comp)]) + else: + X = np.vstack([ + mean + rng.randn(sample, n_features) * np.sqrt(covariance) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp)]) + + y = np.concatenate([j * np.ones(sample, dtype=int) + for j, sample in enumerate(n_samples_comp)]) + + return (X, y) + def _estimate_weighted_log_prob(self, X): """Estimate the weighted log-probabilities, log P(X | Z) + log weights. diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py index 2b5d4458c879e..97aaacb4564c1 100644 --- a/sklearn/mixture/bayesian_mixture.py +++ b/sklearn/mixture/bayesian_mixture.py @@ -5,7 +5,7 @@ import math import numpy as np -from scipy.special import digamma, gammaln +from scipy.special import betaln, digamma, gammaln from .base import BaseMixture, _check_shape from .gaussian_mixture import _check_precision_matrix @@ -63,19 +63,29 @@ def _log_wishart_norm(degrees_of_freedom, log_det_precisions_chol, n_features): class BayesianGaussianMixture(BaseMixture): - """Variational estimation of a Gaussian mixture. + """Variational Bayesian estimation of a Gaussian mixture. This class allows to infer an approximate posterior distribution over the parameters of a Gaussian mixture distribution. The effective number of components can be inferred from the data. + This class implements two types of prior for the weights distribution: a + finite mixture model with Dirichlet distribution and an infinite mixture + model with the Dirichlet Process. In practice Dirichlet Process inference + algorithm is approximated and uses a truncated distribution with a fixed + maximum number of components (called the Stick-breaking representation). + The number of components actually used almost always depends on the data. + + .. versionadded:: 0.18 + *BayesianGaussianMixture*. + Read more in the :ref:`User Guide `. Parameters ---------- n_components : int, defaults to 1. The number of mixture components. Depending on the data and the value - of the `dirichlet_concentration_prior` the model can decide to not use + of the `weight_concentration_prior` the model can decide to not use all the components by setting some component `weights_` to values very close to zero. 
The number of effective components is therefore smaller than n_components. @@ -83,10 +93,10 @@ class BayesianGaussianMixture(BaseMixture): covariance_type : {'full', 'tied', 'diag', 'spherical'}, defaults to 'full'. String describing the type of covariance parameters to use. Must be one of:: - 'full' (each component has its own general covariance matrix). + 'full' (each component has its own general covariance matrix), 'tied' (all components share the same general covariance matrix), 'diag' (each component has its own diagonal covariance matrix), - 'spherical' (each component has its own single variance), + 'spherical' (each component has its own single variance). tol : float, defaults to 1e-3. The convergence threshold. EM iterations will stop when the @@ -111,7 +121,13 @@ class BayesianGaussianMixture(BaseMixture): 'kmeans' : responsibilities are initialized using kmeans. 'random' : responsibilities are initialized randomly. - dirichlet_concentration_prior : float | None, optional. + weight_concentration_prior_type : str, defaults to 'dirichlet_process'. + String describing the type of the weight concentration prior. + Must be one of:: + 'dirichlet_process' (using the Stick-breaking representation), + 'dirichlet_distribution' (can favor more uniform weights). + + weight_concentration_prior : float | None, optional. The dirichlet concentration of each component on the weight distribution (Dirichlet). The higher concentration puts more mass in the center and will lead to more components being active, while a lower @@ -122,7 +138,7 @@ class BayesianGaussianMixture(BaseMixture): mean_precision_prior : float | None, optional. The precision prior on the mean distribution (Gaussian). Controls the extend to where means can be placed. Smaller - values concentrates the means of each clusters around `mean_prior`. + values concentrate the means of each clusters around `mean_prior`. The value of the parameter must be greater than 0. If it is None, it's set to 1. @@ -157,6 +173,9 @@ class BayesianGaussianMixture(BaseMixture): it prints also the log probability and the time needed for each step. + verbose_interval : int, default to 10. + Number of iteration done before the next print. + Attributes ---------- weights_ : array-like, shape (`n_components`,) @@ -210,21 +229,25 @@ class BayesianGaussianMixture(BaseMixture): Lower bound value on the likelihood (of the training data with respect to the model) of the best fit of inference. - dirichlet_concentration_prior_ : float + weight_concentration_prior_ : tuple or float The dirichlet concentration of each component on the weight - distribution (Dirichlet). The higher concentration puts more mass in + distribution (Dirichlet). The type depends on + `weight_concentration_prior_type`:: + (float, float) if 'dirichlet_process' (Beta parameters), + float if 'dirichlet_distribution' (Dirichlet parameters). + The higher concentration puts more mass in the center and will lead to more components being active, while a lower concentration parameter will lead to more mass at the edge of the simplex. - dirichlet_concentration_ : array-like, shape (`n_components`, ) + weight_concentration_ : array-like, shape (`n_components`, ) The dirichlet concentration of each component on the weight distribution (Dirichlet). mean_precision_prior : float The precision prior on the mean distribution (Gaussian). Controls the extend to where means can be placed. 
- Smaller values concentrates the means of each clusters around + Smaller values concentrate the means of each clusters around `mean_prior`. mean_precision_ : array-like, shape (`n_components`, ) @@ -251,15 +274,32 @@ class BayesianGaussianMixture(BaseMixture): See Also -------- GaussianMixture : Finite Gaussian mixture fit with EM. + + References + ---------- + + .. [1] `Bishop, Christopher M. (2006). "Pattern recognition and machine + learning". Vol. 4 No. 4. New York: Springer. + `_ + + .. [2] `Hagai Attias. (2000). "A Variational Bayesian Framework for + Graphical Models". In Advances in Neural Information Processing + Systems 12. + `_ + + .. [3] `Blei, David M. and Michael I. Jordan. (2006). "Variational + inference for Dirichlet process mixtures". Bayesian analysis 1.1 + """ def __init__(self, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', - dirichlet_concentration_prior=None, + weight_concentration_prior_type='dirichlet_process', + weight_concentration_prior=None, mean_precision_prior=None, mean_prior=None, degrees_of_freedom_prior=None, covariance_prior=None, random_state=None, warm_start=False, verbose=0, - verbose_interval=20): + verbose_interval=10): super(BayesianGaussianMixture, self).__init__( n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, @@ -267,7 +307,8 @@ def __init__(self, n_components=1, covariance_type='full', tol=1e-3, verbose=verbose, verbose_interval=verbose_interval) self.covariance_type = covariance_type - self.dirichlet_concentration_prior = dirichlet_concentration_prior + self.weight_concentration_prior_type = weight_concentration_prior_type + self.weight_concentration_prior = weight_concentration_prior self.mean_precision_prior = mean_precision_prior self.mean_prior = mean_prior self.degrees_of_freedom_prior = degrees_of_freedom_prior @@ -285,6 +326,15 @@ def _check_parameters(self, X): "'covariance_type' should be in " "['spherical', 'tied', 'diag', 'full']" % self.covariance_type) + + if (self.weight_concentration_prior_type not in + ['dirichlet_process', 'dirichlet_distribution']): + raise ValueError( + "Invalid value for 'weight_concentration_prior_type': %s " + "'weight_concentration_prior_type' should be in " + "['dirichlet_process', 'dirichlet_distribution']" + % self.weight_concentration_prior_type) + self._check_weights_parameters() self._check_means_parameters(X) self._check_precision_parameters(X) @@ -292,15 +342,15 @@ def _check_parameters(self, X): def _check_weights_parameters(self): """Check the parameter of the Dirichlet distribution.""" - if self.dirichlet_concentration_prior is None: - self.dirichlet_concentration_prior_ = 1. / self.n_components - elif self.dirichlet_concentration_prior > 0.: - self.dirichlet_concentration_prior_ = ( - self.dirichlet_concentration_prior) + if self.weight_concentration_prior is None: + self.weight_concentration_prior_ = 1. / self.n_components + elif self.weight_concentration_prior > 0.: + self.weight_concentration_prior_ = ( + self.weight_concentration_prior) else: - raise ValueError("The parameter 'dirichlet_concentration_prior' " + raise ValueError("The parameter 'weight_concentration_prior' " "should be greater than 0., but got %.3f." - % self.dirichlet_concentration_prior) + % self.weight_concentration_prior) def _check_means_parameters(self, X): """Check the parameters of the Gaussian distribution. 
@@ -410,8 +460,16 @@ def _estimate_weights(self, nk):
         ----------
         nk : array-like, shape (n_components,)
         """
-        self.dirichlet_concentration_ = (
-            self.dirichlet_concentration_prior_ + nk)
+        if self.weight_concentration_prior_type == 'dirichlet_process':
+            # For the Dirichlet process, weight_concentration will be a tuple
+            # containing the two parameters of the beta distribution
+            self.weight_concentration_ = (
+                1. + nk,
+                (self.weight_concentration_prior_ +
+                 np.hstack((np.cumsum(nk[::-1])[-2::-1], 0))))
+        else:
+            # case Variational Gaussian mixture with Dirichlet distribution
+            self.weight_concentration_ = self.weight_concentration_prior_ + nk
 
     def _estimate_means(self, nk, xk):
         """Estimate the parameters of the Gaussian distribution.
@@ -575,7 +633,7 @@ def _estimate_wishart_spherical(self, nk, xk, sk):
         self.covariances_ /= self.degrees_of_freedom_
 
     def _check_is_fitted(self):
-        check_is_fitted(self, ['dirichlet_concentration_', 'mean_precision_',
+        check_is_fitted(self, ['weight_concentration_', 'mean_precision_',
                                'means_', 'degrees_of_freedom_',
                                'covariances_', 'precisions_',
                                'precisions_cholesky_'])
@@ -600,8 +658,17 @@ def _m_step(self, X, log_resp):
         self._estimate_precisions(nk, xk, sk)
 
     def _estimate_log_weights(self):
-        return (digamma(self.dirichlet_concentration_) -
-                digamma(np.sum(self.dirichlet_concentration_)))
+        if self.weight_concentration_prior_type == 'dirichlet_process':
+            digamma_sum = digamma(self.weight_concentration_[0] +
+                                  self.weight_concentration_[1])
+            digamma_a = digamma(self.weight_concentration_[0])
+            digamma_b = digamma(self.weight_concentration_[1])
+            return (digamma_a - digamma_sum +
+                    np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1])))
+        else:
+            # case Variational Gaussian mixture with Dirichlet distribution
+            return (digamma(self.weight_concentration_) -
+                    digamma(np.sum(self.weight_concentration_)))
 
     def _estimate_log_prob(self, X):
         _, n_features = X.shape
@@ -657,25 +724,41 @@ def _compute_lower_bound(self, log_resp, log_prob_norm):
         log_wishart = np.sum(_log_wishart_norm(
             self.degrees_of_freedom_, log_det_precisions_chol, n_features))
 
-        return (-np.sum(np.exp(log_resp) * log_resp) - log_wishart -
-                _log_dirichlet_norm(self.dirichlet_concentration_) -
+        if self.weight_concentration_prior_type == 'dirichlet_process':
+            log_norm_weight = -np.sum(betaln(self.weight_concentration_[0],
+                                             self.weight_concentration_[1]))
+        else:
+            log_norm_weight = _log_dirichlet_norm(self.weight_concentration_)
+
+        return (-np.sum(np.exp(log_resp) * log_resp) -
+                log_wishart - log_norm_weight -
                 0.5 * n_features * np.sum(np.log(self.mean_precision_)))
 
     def _get_parameters(self):
-        return (self.dirichlet_concentration_,
+        return (self.weight_concentration_,
                 self.mean_precision_, self.means_, self.degrees_of_freedom_,
                 self.covariances_, self.precisions_cholesky_)
 
     def _set_parameters(self, params):
-        (self.dirichlet_concentration_, self.mean_precision_, self.means_,
+        (self.weight_concentration_, self.mean_precision_, self.means_,
         self.degrees_of_freedom_, self.covariances_,
         self.precisions_cholesky_) = params
 
-        # Attributes computation
-        self.weights_ = (self.dirichlet_concentration_ /
-                         np.sum(self.dirichlet_concentration_))
+        # Weights computation
+        if self.weight_concentration_prior_type == "dirichlet_process":
+            weight_dirichlet_sum = (self.weight_concentration_[0] +
+                                    self.weight_concentration_[1])
+            tmp = self.weight_concentration_[1] / weight_dirichlet_sum
+            self.weights_ = (
+                self.weight_concentration_[0] / weight_dirichlet_sum *
+                np.hstack((1, np.cumprod(tmp[:-1]))))
+            self.weights_ /= np.sum(self.weights_)
+        else:
+            self.weights_ = (self.weight_concentration_ /
+                             np.sum(self.weight_concentration_))
+
+        # Precisions matrices computation
         if self.covariance_type == 'full':
             self.precisions_ = np.array([
                 np.dot(prec_chol, prec_chol.T)
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
index ee2bd64911165..a8a1e2d928b11 100644
--- a/sklearn/mixture/dpgmm.py
+++ b/sklearn/mixture/dpgmm.py
@@ -10,6 +10,13 @@
 #          Fabian Pedregosa
 #
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/test_dpgmm.py'
+# - 'sklearn/mixture/test_gmm.py'
+
 import numpy as np
 from scipy.special import digamma as _digamma, gammaln as _gammaln
 from scipy import linalg
@@ -616,9 +623,11 @@ def _fit(self, X, y=None):
         return z
 
 
-@deprecated("The DPGMM class is not working correctly and it's better "
-            "to not use it. DPGMM is deprecated in 0.18 and "
-            "will be removed in 0.20.")
+@deprecated("The `DPGMM` class is not working correctly and it's better "
+            "to use `sklearn.mixture.BayesianGaussianMixture` class with "
+            "parameter `weight_concentration_prior_type='dirichlet_process'` "
+            "instead. DPGMM is deprecated in 0.18 and will be "
+            "removed in 0.20.")
 class DPGMM(_DPGMMBase):
     def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
                  random_state=None, tol=1e-3, verbose=0, min_covar=None,
@@ -630,8 +639,10 @@ def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
                          init_params=init_params)
 
 
-@deprecated("The VBGMM class is not working correctly and it's better "
-            "to use sklearn.mixture.BayesianGaussianMixture class instead. "
+@deprecated("The `VBGMM` class is not working correctly and it's better "
+            "to use `sklearn.mixture.BayesianGaussianMixture` class with "
+            "parameter `weight_concentration_prior_type="
+            "'dirichlet_distribution'` instead. "
             "VBGMM is deprecated in 0.18 and will be removed in 0.20.")
 class VBGMM(_DPGMMBase):
     """Variational Inference for the Gaussian Mixture Model
diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py
index 749af6c82fd63..94c3a07d93075 100644
--- a/sklearn/mixture/gaussian_mixture.py
+++ b/sklearn/mixture/gaussian_mixture.py
@@ -439,6 +439,11 @@ class GaussianMixture(BaseMixture):
     This class allows to estimate the parameters of a Gaussian mixture
     distribution.
 
+    .. versionadded:: 0.18
+       *GaussianMixture*.
+
+    Read more in the :ref:`User Guide `.
+
     Parameters
     ----------
     n_components : int, defaults to 1.
@@ -448,10 +453,10 @@ class GaussianMixture(BaseMixture):
         defaults to 'full'.
         String describing the type of covariance parameters to use.
         Must be one of::
 
-        'full' (each component has its own general covariance matrix).
+        'full' (each component has its own general covariance matrix),
         'tied' (all components share the same general covariance matrix),
         'diag' (each component has its own diagonal covariance matrix),
-        'spherical' (each component has its own single variance),
+        'spherical' (each component has its own single variance).
 
     tol : float, defaults to 1e-3.
         The convergence threshold. EM iterations will stop when the
@@ -506,6 +511,9 @@ class GaussianMixture(BaseMixture):
         it prints also the log probability and the time needed
         for each step.
 
+    verbose_interval : int, defaults to 10.
+        Number of iterations done before the next print.
+
     Attributes
     ----------
     weights_ : array-like, shape (n_components,)
@@ -559,8 +567,8 @@ class GaussianMixture(BaseMixture):
 
     See Also
     --------
-    BayesianGaussianMixture : Finite gaussian mixture model fit with a
-        variational algorithm.
+    BayesianGaussianMixture : Gaussian mixture model fit with variational
+        inference.
     """
 
     def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
index e5c8734c770bd..d1a4eb818a6c0 100644
--- a/sklearn/mixture/gmm.py
+++ b/sklearn/mixture/gmm.py
@@ -9,6 +9,13 @@
 #          Fabian Pedregosa
 #          Bertrand Thirion
 
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/test_dpgmm.py'
+# - 'sklearn/mixture/test_gmm.py'
+
 import numpy as np
 from scipy import linalg
 from time import time
@@ -65,6 +72,9 @@ def log_multivariate_normal_density(X, means, covars, covariance_type='diag'):
                                              X, means, covars)
 
 
+@deprecated("The function sample_gaussian is deprecated in 0.18"
+            " and will be removed in 0.20."
+            " Use numpy.random.multivariate_normal instead.")
 def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
                     random_state=None):
     """Generate random samples from a Gaussian distribution.
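Editor's note on the weight update above: for ``weight_concentration_prior_type='dirichlet_process'``, ``_estimate_weights`` stores the two Beta parameters of a truncated stick-breaking representation, and ``_set_parameters`` later turns them into mixture weights. The standalone sketch below mirrors that computation outside the estimator so the stick-breaking step is easier to follow; the helper name ``stick_breaking_weights``, the toy counts ``nk``, and the prior value ``0.5`` are illustrative assumptions, not code from this patch.

import numpy as np

def stick_breaking_weights(beta_a, beta_b):
    # Sketch only: convert the two Beta parameters stored in
    # ``weight_concentration_`` into mixture weights, mirroring the
    # 'dirichlet_process' branch of ``_set_parameters`` above.
    beta_a = np.asarray(beta_a, dtype=float)
    beta_b = np.asarray(beta_b, dtype=float)
    stick_fraction = beta_a / (beta_a + beta_b)
    # Stick length remaining before component k: prod_{j < k} (1 - fraction_j)
    remaining = np.hstack((1., np.cumprod(1. - stick_fraction[:-1])))
    weights = stick_fraction * remaining
    # The truncated representation is renormalized, as in the patch
    return weights / np.sum(weights)

# Toy example: 3 components, responsibilities nk summing to 10 samples,
# and a hypothetical concentration prior of 0.5
nk = np.array([6., 3., 1.])
beta_a = 1. + nk                                             # as in _estimate_weights
beta_b = 0.5 + np.hstack((np.cumsum(nk[::-1])[-2::-1], 0))   # prior + trailing counts
print(stick_breaking_weights(beta_a, beta_b))  # roughly [0.62, 0.29, 0.09]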
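The deprecation messages in the hunks above point users of the old ``DPGMM`` and ``VBGMM`` classes at ``BayesianGaussianMixture`` with the matching ``weight_concentration_prior_type``, and users of ``sample_gaussian`` at ``numpy.random.multivariate_normal``. A minimal migration sketch, assuming this patch is applied, toy data, and an arbitrary ``n_components=5``, could look like the following; it only illustrates the suggested replacements and is not part of the patch.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

X = np.random.RandomState(0).rand(100, 2)  # toy data for illustration

# Suggested replacement for the deprecated DPGMM class
dpgmm = BayesianGaussianMixture(
    n_components=5,
    weight_concentration_prior_type='dirichlet_process').fit(X)

# Suggested replacement for the deprecated VBGMM class
vbgmm = BayesianGaussianMixture(
    n_components=5,
    weight_concentration_prior_type='dirichlet_distribution').fit(X)

# Suggested replacement for the deprecated sample_gaussian function
samples = np.random.RandomState(0).multivariate_normal(
    np.zeros(2), np.eye(2), size=10)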
diff --git a/sklearn/mixture/tests/test_bayesian_mixture.py b/sklearn/mixture/tests/test_bayesian_mixture.py index 3003a9444669d..e678c07d9236f 100644 --- a/sklearn/mixture/tests/test_bayesian_mixture.py +++ b/sklearn/mixture/tests/test_bayesian_mixture.py @@ -19,15 +19,16 @@ COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical'] +PRIOR_TYPE = ['dirichlet_process', 'dirichlet_distribution'] def test_log_dirichlet_norm(): rng = np.random.RandomState(0) - dirichlet_concentration = rng.rand(2) - expected_norm = (gammaln(np.sum(dirichlet_concentration)) - - np.sum(gammaln(dirichlet_concentration))) - predected_norm = _log_dirichlet_norm(dirichlet_concentration) + weight_concentration = rng.rand(2) + expected_norm = (gammaln(np.sum(weight_concentration)) - + np.sum(gammaln(weight_concentration))) + predected_norm = _log_dirichlet_norm(weight_concentration) assert_almost_equal(expected_norm, predected_norm) @@ -64,8 +65,22 @@ def test_bayesian_mixture_covariance_type(): "Invalid value for 'covariance_type': %s " "'covariance_type' should be in " "['spherical', 'tied', 'diag', 'full']" - % covariance_type, - bgmm.fit, X) + % covariance_type, bgmm.fit, X) + + +def test_bayesian_mixture_weight_concentration_prior_type(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + X = rng.rand(n_samples, n_features) + + bad_prior_type = 'bad_prior_type' + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=bad_prior_type, random_state=rng) + assert_raise_message(ValueError, + "Invalid value for 'weight_concentration_prior_type':" + " %s 'weight_concentration_prior_type' should be in " + "['dirichlet_process', 'dirichlet_distribution']" + % bad_prior_type, bgmm.fit, X) def test_bayesian_mixture_weights_prior_initialisation(): @@ -73,29 +88,29 @@ def test_bayesian_mixture_weights_prior_initialisation(): n_samples, n_components, n_features = 10, 5, 2 X = rng.rand(n_samples, n_features) - # Check raise message for a bad value of dirichlet_concentration_prior - bad_dirichlet_concentration_prior_ = 0. + # Check raise message for a bad value of weight_concentration_prior + bad_weight_concentration_prior_ = 0. bgmm = BayesianGaussianMixture( - dirichlet_concentration_prior=bad_dirichlet_concentration_prior_, + weight_concentration_prior=bad_weight_concentration_prior_, random_state=0) assert_raise_message(ValueError, - "The parameter 'dirichlet_concentration_prior' " + "The parameter 'weight_concentration_prior' " "should be greater than 0., but got %.3f." - % bad_dirichlet_concentration_prior_, + % bad_weight_concentration_prior_, bgmm.fit, X) - # Check correct init for a given value of dirichlet_concentration_prior - dirichlet_concentration_prior = rng.rand() + # Check correct init for a given value of weight_concentration_prior + weight_concentration_prior = rng.rand() bgmm = BayesianGaussianMixture( - dirichlet_concentration_prior=dirichlet_concentration_prior, + weight_concentration_prior=weight_concentration_prior, random_state=rng).fit(X) - assert_almost_equal(dirichlet_concentration_prior, - bgmm.dirichlet_concentration_prior_) + assert_almost_equal(weight_concentration_prior, + bgmm.weight_concentration_prior_) - # Check correct init for the default value of dirichlet_concentration_prior + # Check correct init for the default value of weight_concentration_prior bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X) - assert_almost_equal(1. / n_components, bgmm.dirichlet_concentration_prior_) + assert_almost_equal(1. 
/ n_components, bgmm.weight_concentration_prior_) def test_bayesian_mixture_means_prior_initialisation(): @@ -237,44 +252,57 @@ def test_bayesian_mixture_weights(): n_samples, n_features = 10, 2 X = rng.rand(n_samples, n_features) - bgmm = BayesianGaussianMixture(random_state=rng).fit(X) - - # Check the weights values - expected_weights = (bgmm.dirichlet_concentration_ / - np.sum(bgmm.dirichlet_concentration_)) - predected_weights = bgmm.weights_ - assert_almost_equal(expected_weights, predected_weights) + # Case Dirichlet distribution for the weight concentration prior type + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_distribution", + n_components=3, random_state=rng).fit(X) - # Check the weights sum = 1 + expected_weights = (bgmm.weight_concentration_ / + np.sum(bgmm.weight_concentration_)) + assert_almost_equal(expected_weights, bgmm.weights_) assert_almost_equal(np.sum(bgmm.weights_), 1.0) + # Case Dirichlet process for the weight concentration prior type + dpgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_process", + n_components=3, random_state=rng).fit(X) + weight_dirichlet_sum = (dpgmm.weight_concentration_[0] + + dpgmm.weight_concentration_[1]) + tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum + expected_weights = (dpgmm.weight_concentration_[0] / weight_dirichlet_sum * + np.hstack((1, np.cumprod(tmp[:-1])))) + expected_weights /= np.sum(expected_weights) + assert_almost_equal(expected_weights, dpgmm.weights_) + assert_almost_equal(np.sum(dpgmm.weights_), 1.0) + @ignore_warnings(category=ConvergenceWarning) def test_monotonic_likelihood(): # We check that each step of the each step of variational inference without # regularization improve monotonically the training set of the bound rng = np.random.RandomState(0) - rand_data = RandomData(rng, scale=7) + rand_data = RandomData(rng, scale=20) n_components = rand_data.n_components - for covar_type in COVARIANCE_TYPE: - X = rand_data.X[covar_type] - bgmm = BayesianGaussianMixture(n_components=2 * n_components, - covariance_type=covar_type, - warm_start=True, max_iter=1, - random_state=rng, tol=1e-4) - current_lower_bound = -np.infty - # Do one training iteration at a time so we can make sure that the - # training log likelihood increases after each iteration. - for _ in range(500): - prev_lower_bound = current_lower_bound - current_lower_bound = bgmm.fit(X).lower_bound_ - assert_greater_equal(current_lower_bound, prev_lower_bound) - - if bgmm.converged_: - break - assert(bgmm.converged_) + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, covariance_type=covar_type, + warm_start=True, max_iter=1, random_state=rng, tol=1e-4) + current_lower_bound = -np.infty + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. 
+ for _ in range(600): + prev_lower_bound = current_lower_bound + current_lower_bound = bgmm.fit(X).lower_bound_ + assert_greater_equal(current_lower_bound, prev_lower_bound) + + if bgmm.converged_: + break + assert(bgmm.converged_) def test_compare_covar_type(): @@ -284,46 +312,55 @@ def test_compare_covar_type(): rand_data = RandomData(rng, scale=7) X = rand_data.X['full'] n_components = rand_data.n_components - # Computation of the full_covariance - bgmm = BayesianGaussianMixture(n_components=2 * n_components, - covariance_type='full', - max_iter=1, random_state=0, tol=1e-7) - bgmm._check_initial_parameters(X) - bgmm._initialize_parameters(X) - full_covariances = (bgmm.covariances_ * - bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis]) - - # Check tied_covariance = mean(full_covariances, 0) - bgmm = BayesianGaussianMixture(n_components=2 * n_components, - covariance_type='tied', - max_iter=1, random_state=0, tol=1e-7) - bgmm._check_initial_parameters(X) - bgmm._initialize_parameters(X) - - tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ - assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) - - # Check diag_covariance = diag(full_covariances) - bgmm = BayesianGaussianMixture(n_components=2 * n_components, - covariance_type='diag', - max_iter=1, random_state=0, tol=1e-7) - bgmm._check_initial_parameters(X) - bgmm._initialize_parameters(X) - - diag_covariances = (bgmm.covariances_ * - bgmm.degrees_of_freedom_[:, np.newaxis]) - assert_almost_equal(diag_covariances, - np.array([np.diag(cov) for cov in full_covariances])) - - # Check spherical_covariance = np.mean(diag_covariances, 0) - bgmm = BayesianGaussianMixture(n_components=2 * n_components, - covariance_type='spherical', - max_iter=1, random_state=0, tol=1e-7) - bgmm._check_initial_parameters(X) - bgmm._initialize_parameters(X) - - spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ - assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1)) + + for prior_type in PRIOR_TYPE: + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, covariance_type='full', + max_iter=1, random_state=0, tol=1e-7) + bgmm._check_initial_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + full_covariances = ( + bgmm.covariances_ * + bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis]) + + # Check tied_covariance = mean(full_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, covariance_type='tied', + max_iter=1, random_state=0, tol=1e-7) + bgmm._check_initial_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) + + # Check diag_covariance = diag(full_covariances) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, covariance_type='diag', + max_iter=1, random_state=0, tol=1e-7) + bgmm._check_initial_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + diag_covariances = (bgmm.covariances_ * + bgmm.degrees_of_freedom_[:, np.newaxis]) + assert_almost_equal(diag_covariances, + np.array([np.diag(cov) + for cov in full_covariances])) + + # Check spherical_covariance = np.mean(diag_covariances, 0) + bgmm = BayesianGaussianMixture( + 
weight_concentration_prior_type=prior_type,
+            n_components=2 * n_components, covariance_type='spherical',
+            max_iter=1, random_state=0, tol=1e-7)
+        bgmm._check_initial_parameters(X)
+        bgmm._initialize_parameters(X, np.random.RandomState(0))
+
+        spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
+        assert_almost_equal(
+            spherical_covariances, np.mean(diag_covariances, 1))
 
 
 @ignore_warnings(category=ConvergenceWarning)
@@ -357,3 +394,28 @@ def test_check_covariance_precision():
         else:
             assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                 np.ones(n_components))
+
+
+@ignore_warnings(category=ConvergenceWarning)
+def test_invariant_translation():
+    # We check here that adding a constant to the data only shifts the means
+    # of the mixture and leaves the weights and the covariances unchanged
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=100)
+    n_components = 2 * rand_data.n_components
+
+    for prior_type in PRIOR_TYPE:
+        for covar_type in COVARIANCE_TYPE:
+            X = rand_data.X[covar_type]
+            bgmm1 = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=n_components, max_iter=100, random_state=0,
+                tol=1e-3, reg_covar=0).fit(X)
+            bgmm2 = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=n_components, max_iter=100, random_state=0,
+                tol=1e-3, reg_covar=0).fit(X + 100)
+
+            assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100)
+            assert_almost_equal(bgmm1.weights_, bgmm2.weights_)
+            assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_)
diff --git a/sklearn/mixture/tests/test_dpgmm.py b/sklearn/mixture/tests/test_dpgmm.py
index 363d31bc29f4f..8ca38626b4cef 100644
--- a/sklearn/mixture/tests/test_dpgmm.py
+++ b/sklearn/mixture/tests/test_dpgmm.py
@@ -1,3 +1,9 @@
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/test_gmm.py'
 import unittest
 import sys
 
@@ -143,10 +149,12 @@ def test_wishart_logz():
 
 @ignore_warnings(category=DeprecationWarning)
 def test_DPGMM_deprecation():
-    assert_warns_message(DeprecationWarning, "The DPGMM class is"
-                         " not working correctly and it's better "
-                         "to not use it. DPGMM is deprecated in 0.18 "
-                         "and will be removed in 0.20.", DPGMM)
+    assert_warns_message(
+        DeprecationWarning, "The `DPGMM` class is not working correctly and "
+        "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
+        "with parameter `weight_concentration_prior_type='dirichlet_process'` "
+        "instead. DPGMM is deprecated in 0.18 and will be removed in 0.20.",
+        DPGMM)
 
 
 def do_model(self, **kwds):
@@ -183,11 +191,12 @@ class TestDPGMMWithFullCovars(unittest.TestCase, DPGMMTester):
 
 
 def test_VBGMM_deprecation():
-    assert_warns_message(DeprecationWarning, "The VBGMM class is not working "
-                         "correctly and it's better to use "
-                         "sklearn.mixture.BayesianGaussianMixture class "
-                         "instead. VBGMM is deprecated in 0.18 and will be "
-                         "removed in 0.20.", VBGMM)
+    assert_warns_message(
+        DeprecationWarning, "The `VBGMM` class is not working correctly and "
+        "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
+        "with parameter `weight_concentration_prior_type="
+        "'dirichlet_distribution'` instead. VBGMM is deprecated "
+        "in 0.18 and will be removed in 0.20.", VBGMM)
 
 
 class VBGMMTester(GMMTester):
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
index 194248ac7a0df..43d61e010a3a0 100644
--- a/sklearn/mixture/tests/test_gaussian_mixture.py
+++ b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -33,6 +33,7 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import ignore_warnings
 
 COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical']
 
@@ -650,7 +651,7 @@ def test_gaussian_mixture_fit_convergence_warning():
     assert_warns_message(ConvergenceWarning,
                          'Initialization %d did not converged. '
                          'Try different init parameters, '
-                         'or increase n_init, tol '
+                         'or increase max_iter, tol '
                          'or check for degenerate data.'
                          % max_iter, g.fit, X)
 
@@ -913,3 +914,60 @@ def test_property():
                                       gmm.covariances_)
         else:
             assert_array_almost_equal(gmm.precisions_, 1. / gmm.covariances_)
+
+
+def test_sample():
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=7)
+    n_features, n_components = rand_data.n_features, rand_data.n_components
+
+    for covar_type in COVARIANCE_TYPE:
+        X = rand_data.X[covar_type]
+
+        gmm = GaussianMixture(n_components=n_components,
+                              covariance_type=covar_type, random_state=rng)
+        # To sample we need the GaussianMixture to be fitted
+        assert_raise_message(NotFittedError, "This GaussianMixture instance "
+                             "is not fitted", gmm.sample, 0)
+        gmm.fit(X)
+
+        assert_raise_message(ValueError, "Invalid value for 'n_samples",
+                             gmm.sample, 0)
+
+        # Just to make sure the class samples correctly
+        X_s, y_s = gmm.sample(20000)
+        for k in range(n_features):
+            if covar_type == 'full':
+                assert_array_almost_equal(gmm.covariances_[k],
+                                          np.cov(X_s[y_s == k].T), decimal=1)
+            elif covar_type == 'tied':
+                assert_array_almost_equal(gmm.covariances_,
+                                          np.cov(X_s[y_s == k].T), decimal=1)
+            elif covar_type == 'diag':
+                assert_array_almost_equal(gmm.covariances_[k],
+                                          np.diag(np.cov(X_s[y_s == k].T)),
+                                          decimal=1)
+            else:
+                assert_array_almost_equal(
+                    gmm.covariances_[k], np.var(X_s[y_s == k] - gmm.means_[k]),
+                    decimal=1)
+
+        means_s = np.array([np.mean(X_s[y_s == k], 0)
+                           for k in range(n_features)])
+        assert_array_almost_equal(gmm.means_, means_s, decimal=1)
+
+
+@ignore_warnings(category=ConvergenceWarning)
+def test_init():
+    # We check that increasing the number of initializations n_init
+    # gives a better solution
+    random_state = 0
+    rand_data = RandomData(np.random.RandomState(random_state), scale=1)
+    n_components = rand_data.n_components
+    X = rand_data.X['full']
+
+    gmm1 = GaussianMixture(n_components=n_components, n_init=1,
+                           max_iter=1, random_state=random_state).fit(X)
+    gmm2 = GaussianMixture(n_components=n_components, n_init=100,
+                           max_iter=1, random_state=random_state).fit(X)
+
+    assert_greater(gmm2.lower_bound_, gmm1.lower_bound_)
diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py
index 66e581a582e0d..55f0dfb83f225 100644
--- a/sklearn/mixture/tests/test_gmm.py
+++ b/sklearn/mixture/tests/test_gmm.py
@@ -1,5 +1,9 @@
-# These tests are those of the deprecated GMM class
-
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/test_dpgmm.py'
 import unittest
 import copy
 import sys