scikit-learn · jnothman · Jul 29, 2019 · Feb 27, 2019 · Apr 1, 2019 · May 14, 2019
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -42,11 +42,11 @@ jobs:
       - MINICONDA_PATH: ~/miniconda
       - CONDA_ENV_NAME: testenv
       - PYTHON_VERSION: "2"
-      - NUMPY_VERSION: "1.10"
-      - SCIPY_VERSION: "0.16"
-      - MATPLOTLIB_VERSION: "1.4"
-      - SCIKIT_IMAGE_VERSION: "0.11"
-      - PANDAS_VERSION: "0.17.1"
+      - NUMPY_VERSION: "1.*"
+      - SCIPY_VERSION: "0.*"
+      - MATPLOTLIB_VERSION: "*"
+      - SCIKIT_IMAGE_VERSION: "0.*"
+      - PANDAS_VERSION: "0.*"
     steps:
       - checkout
       - run: ./build_tools/circle/checkout_merge_commit.sh

diff --git a/.travis.yml b/.travis.yml
@@ -35,12 +35,12 @@ matrix:
             - libatlas-dev
     # Python 3.4 build
     - env: DISTRIB="conda" PYTHON_VERSION="3.4" INSTALL_MKL="false"
-           NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.16.1" CYTHON_VERSION="0.25.2"
-           PILLOW_VERSION="4.0.0" COVERAGE=true
+           NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17" CYTHON_VERSION="0.25.2"
+           PILLOW_VERSION="4.0.0" COVERAGE=
       if: type != cron
     # Python 3.5 build
     - env: DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="false"
-           NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.16.1" CYTHON_VERSION="0.25.2"
+           NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17" CYTHON_VERSION="0.25.2"
            PILLOW_VERSION="4.0.0" COVERAGE=true
            SKLEARN_SITE_JOBLIB=1 JOBLIB_VERSION="0.11"
       if: type != cron

diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
@@ -119,7 +119,7 @@ conda update --yes --quiet conda
 # provided versions
 conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \
   numpy="${NUMPY_VERSION:-*}" scipy="${SCIPY_VERSION:-*}" cython \
-  pytest coverage matplotlib="${MATPLOTLIB_VERSION:-*}" sphinx=1.6.2 pillow \
+  pytest coverage matplotlib="${MATPLOTLIB_VERSION:-*}" sphinx=1.6.* pillow \
   scikit-image="${SCIKIT_IMAGE_VERSION:-*}" pandas="${PANDAS_VERSION:-*}" \
   joblib
 

diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
@@ -11,7 +11,7 @@
 # matrix entry) from which we pull from local Travis repository. This allows
 # us to keep build artefact for gcc + cython, and gain time
 
-set -e
+set -ex
 
 echo 'List files from cached directories'
 echo 'pip:'
@@ -38,12 +38,18 @@ make_conda() {
     export PATH=$MINICONDA_PATH/bin:$PATH
     conda update --yes conda
 
-    conda create -n testenv --yes $TO_INSTALL
+    conda create -c conda-forge -n testenv --yes $TO_INSTALL
     source activate testenv
 }
 
+if [[ "$COVERAGE" == "true" ]]; then
+    TEST_DEPS="pytest pytest-cov"
+else
+    TEST_DEPS="pytest"
+fi
+
 if [[ "$DISTRIB" == "conda" ]]; then
-    TO_INSTALL="python=$PYTHON_VERSION pip pytest pytest-cov \
+    TO_INSTALL="python=$PYTHON_VERSION pip $TEST_DEPS \
                 numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \
                 cython=$CYTHON_VERSION"
 
@@ -84,7 +90,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
     # and scipy
     virtualenv --system-site-packages testvenv
     source testvenv/bin/activate
-    pip install pytest pytest-cov cython==$CYTHON_VERSION
+    pip install $TEST_DEPS cython==$CYTHON_VERSION
 
 elif [[ "$DISTRIB" == "scipy-dev" ]]; then
     make_conda python=3.7
@@ -96,7 +102,7 @@ elif [[ "$DISTRIB" == "scipy-dev" ]]; then
     echo "Installing joblib master"
     pip install https://github.com/joblib/joblib/archive/master.zip
     export SKLEARN_SITE_JOBLIB=1
-    pip install pytest pytest-cov
+    pip install $TEST_DEPS
 fi
 
 if [[ "$COVERAGE" == "true" ]]; then

diff --git a/build_tools/travis/test_pytest_soft_dependency.sh b/build_tools/travis/test_pytest_soft_dependency.sh
@@ -7,6 +7,7 @@ if [[ "$CHECK_PYTEST_SOFT_DEPENDENCY" == "true" ]]; then
     if [[ "$COVERAGE" == "true" ]]; then
         # Need to append the coverage to the existing .coverage generated by
         # running the tests
+        pip install coverage
         CMD="coverage run --append"
     else
         CMD="python"

diff --git a/doc/index.rst b/doc/index.rst
@@ -207,20 +207,20 @@
                     <li><em>On-going development:</em>
                     <a href="/dev/whats_new.html"><em>What's new</em> (Changelog)</a>
                     </li>
-                    <li><strong>Scikit-learn 0.21 will drop support for Python 2.7 and Python 3.4.</strong>
+                    <li><strong>Scikit-learn from 0.21 requires Python 3.5 or greater.</strong>
                     </li>
-                    <li><em>March 2019.</em> scikit-learn 0.20.3 is available for download (<a href="whats_new.html#version-0-20-3">Changelog</a>).
+                    <li><em>July 2019.</em> scikit-learn 0.21.3 (<a href="whats_new.html#version-0-21-3">Changelog</a>) and 0.20.4 (<a href="whats_new.html#version-0-20-4">Changelog</a>) are available for download.
+                    </li>
+                    <li><em>May 2019.</em> scikit-learn 0.21.0 to 0.21.2 are available for download (<a href="whats_new.html#version-0-21">Changelog</a>).
                     </li>
-                    <li><em>December 2018.</em> scikit-learn 0.20.2 is available for download (<a href="whats_new.html#version-0-20-2">Changelog</a>)
+                    <li><em>March 2019.</em> scikit-learn 0.20.3 is available for download (<a href="whats_new.html#version-0-20-3">Changelog</a>).
                     </li>
                     <li><em>September 2018.</em> scikit-learn 0.20.0 is available for download (<a href="whats_new.html#version-0-20-0">Changelog</a>).
                     </li>
                     <li><em>July 2018.</em> scikit-learn 0.19.2 is available for download (<a href="whats_new.html#version-0-19">Changelog</a>).
                     </li>
                     <li><em>July 2017.</em> scikit-learn 0.19.0 is available for download (<a href="whats_new/v0.19.html#version-0-19">Changelog</a>).
                     </li>
-                    <li><em>June 2017.</em> scikit-learn 0.18.2 is available for download (<a href="whats_new/v0.18.html#version-0-18-2">Changelog</a>).
-                    </li>
                     </ul>
                 </div>
 

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
@@ -2,6 +2,50 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_0_20_4:
+
+Version 0.20.4
+==============
+
+**July 30, 2019**
+
+This is a bug-fix release with some fixes applied to version 0.20.3.
+
+Changelog
+---------
+
+The bundled version of joblib was upgraded from 0.13.0 to 0.13.2.
+
+:mod:`sklearn.cluster`
+......................
+
+- |Fix| Fixed a bug in :class:`cluster.KMeans` where KMeans++ initialisation
+  could rarely result in an IndexError. :issue:`11756` by `Joel Nothman`_.
+
+:mod:`sklearn.compose`
+......................
+
+- |Fix| Fixed an issue in :class:`compose.ColumnTransformer` where using
+  DataFrames whose column order differs between :term:`fit` and
+  :term:`transform` could lead to silently passing incorrect columns to the
+  ``remainder`` transformer.
+  :pr:`14237` by :user:`Andreas Schuderer <schuderer>`.
+
+:mod:`sklearn.model_selection`
+..............................
+
+- |Fix| Fixed a bug where :class:`model_selection.StratifiedKFold`
+  shuffles each class's samples with the same ``random_state``,
+  making ``shuffle=True`` ineffective.
+  :issue:`13124` by :user:`Hanmin Qin <qinhanmin2014>`.
+
+:mod:`sklearn.neighbors`
+........................
+
+- |Fix| Fixed a bug in :class:`neighbors.KernelDensity` which could not be
+  restored from a pickle if ``sample_weight`` had been used.
+  :issue:`13772` by :user:`Aditya Vyas <aditya1702>`.
+
  .. _changes_0_20_3:
 
 Version 0.20.3
@@ -30,7 +74,7 @@ Changelog
   :issue:`12946` by :user:`Pierre Tallotte <pierretallotte>`.
 
 :mod:`sklearn.covariance`
-......................
+.........................
 
 - |Fix| Fixed a regression in :func:`covariance.graphical_lasso` so that
   the case `n_features=2` is handled correctly. :issue:`13276` by

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
@@ -44,7 +44,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.20.3'
+__version__ = '0.20.4'
 
 
 try:

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
@@ -10,11 +10,11 @@
 # License: BSD 3 clause
 
 import numpy as np
+import warnings
 from scipy import sparse
 
 from ..base import BaseEstimator, ClusterMixin
 from ..utils import check_array, check_consistent_length
-from ..utils.testing import ignore_warnings
 from ..neighbors import NearestNeighbors
 
 from ._dbscan_inner import dbscan_inner
@@ -139,7 +139,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
         X.sum_duplicates()  # XXX: modifies X's internals in-place
 
         # set the diagonal to explicit values, as a point is its own neighbor
-        with ignore_warnings():
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
             X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place
 
         X_mask = X.data <= eps

diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
@@ -111,6 +111,9 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         rand_vals = random_state.random_sample(n_local_trials) * current_pot
         candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                         rand_vals)
+        # XXX: numerical imprecision can result in a candidate_id out of range
+        np.clip(candidate_ids, None, closest_dist_sq.size - 1,
+                out=candidate_ids)
 
         # Compute distances to center candidates
         distance_to_candidates = euclidean_distances(

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -83,7 +83,9 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
         the transformers.
         By setting ``remainder`` to be an estimator, the remaining
         non-specified columns will use the ``remainder`` estimator. The
-        estimator must support `fit` and `transform`.
+        estimator must support :term:`fit` and :term:`transform`.
+        Note that using this feature requires that the DataFrame columns
+        input at :term:`fit` and :term:`transform` have identical order.
 
     sparse_threshold : float, default = 0.3
         If the output of the different transfromers contains sparse matrices,
@@ -295,11 +297,17 @@ def _validate_remainder(self, X):
                 "'passthrough', or estimator. '%s' was passed instead" %
                 self.remainder)
 
-        n_columns = X.shape[1]
+        # Make it possible to check for reordered named columns on transform
+        if (hasattr(X, 'columns') and
+                any(_check_key_type(cols, str) for cols in self._columns)):
+            self._df_columns = X.columns
+
+        self._n_features = X.shape[1]
         cols = []
         for columns in self._columns:
             cols.extend(_get_column_indices(X, columns))
-        remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None
+        remaining_idx = list(set(range(self._n_features)) - set(cols))
+        remaining_idx = sorted(remaining_idx) or None
 
         self._remainder = ('remainder', self.remainder, remaining_idx)
 
@@ -488,8 +496,27 @@ def transform(self, X):
 
         """
         check_is_fitted(self, 'transformers_')
-
         X = _check_X(X)
+
+        if self._n_features > X.shape[1]:
+            raise ValueError('Number of features of the input must be equal '
+                             'to or greater than that of the fitted '
+                             'transformer. Transformer n_features is {0} '
+                             'and input n_features is {1}.'
+                             .format(self._n_features, X.shape[1]))
+
+        # No column reordering allowed for named cols combined with remainder
+        if (self._remainder[2] is not None and
+                hasattr(self, '_df_columns') and
+                hasattr(X, 'columns')):
+            n_cols_fit = len(self._df_columns)
+            n_cols_transform = len(X.columns)
+            if (n_cols_transform >= n_cols_fit and
+                    any(X.columns[:n_cols_fit] != self._df_columns)):
+                raise ValueError('Column ordering must be equal for fit '
+                                 'and for transform when using the '
+                                 'remainder keyword')
+
         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
         self._validate_output(Xs)
 

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -498,6 +498,17 @@ def test_column_transformer_invalid_columns(remainder):
         assert_raise_message(ValueError, "Specifying the columns",
                              ct.fit, X_array)
 
+    # transformed n_features does not match fitted n_features
+    col = [0, 1]
+    ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
+    ct.fit(X_array)
+    X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
+    ct.transform(X_array_more)  # Should accept added columns
+    X_array_fewer = np.array([[0, 1, 2], ]).T
+    err_msg = 'Number of features'
+    with pytest.raises(ValueError, match=err_msg):
+        ct.transform(X_array_fewer)
+
 
 def test_column_transformer_invalid_transformer():
 
@@ -1033,3 +1044,40 @@ def test_column_transformer_negative_column_indexes():
     tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
     tf_2 = ColumnTransformer([('ohe', ohe,  [2])], remainder='passthrough')
     assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
+
+
+@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
+def test_column_transformer_reordered_column_names_remainder(explicit_colname):
+    """Regression test for issue #14223: 'Named col indexing fails with
+       ColumnTransformer remainder on changing DataFrame column ordering'
+
+       Should raise error on changed order combined with remainder.
+       Should allow for added columns in `transform` input DataFrame
+       as long as all preceding columns match.
+    """
+    pd = pytest.importorskip('pandas')
+
+    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
+    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
+
+    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
+    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
+
+    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
+                           remainder=Trans())
+
+    tf.fit(X_fit_df)
+    err_msg = 'Column ordering must be equal'
+    with pytest.raises(ValueError, match=err_msg):
+        tf.transform(X_trans_df)
+
+    # No error for added columns if ordering is identical
+    X_extended_df = X_fit_df.copy()
+    X_extended_df['third'] = [3, 6, 9]
+    tf.transform(X_extended_df)  # No error should be raised
+
+    # No 'columns' AttributeError when transform input is a numpy array
+    X_array = X_fit_array.copy()
+    err_msg = 'Specifying the columns'
+    with pytest.raises(ValueError, match=err_msg):
+        tf.transform(X_array)
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
@@ -357,13 +357,13 @@ def test_scale_and_stability():
             X_score, Y_score = clf.fit_transform(X, Y)
             clf.set_params(scale=False)
             X_s_score, Y_s_score = clf.fit_transform(X_s, Y_s)
-            assert_array_almost_equal(X_s_score, X_score)
-            assert_array_almost_equal(Y_s_score, Y_score)
+            assert_array_almost_equal(X_s_score, X_score, decimal=4)
+            assert_array_almost_equal(Y_s_score, Y_score, decimal=4)
             # Scaling should be idempotent
             clf.set_params(scale=True)
             X_score, Y_score = clf.fit_transform(X_s, Y_s)
-            assert_array_almost_equal(X_s_score, X_score)
-            assert_array_almost_equal(Y_s_score, Y_score)
+            assert_array_almost_equal(X_s_score, X_score, decimal=4)
+            assert_array_almost_equal(Y_s_score, Y_score, decimal=4)
 
 
 def test_pls_errors():

diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py
@@ -134,8 +134,8 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64,
 
     See also
     --------
-    load_svmlight_files: similar function for loading multiple files in this
-    format, enforcing the same number of features/columns on all of them.
+    load_svmlight_files : similar function for loading multiple files in this
+        format, enforcing the same number of features/columns on all of them.
 
     Examples
     --------

diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py
@@ -14,7 +14,7 @@
     ==================== ===============================================
     **Documentation:**       https://joblib.readthedocs.io
 
-    **Download:**            http://pypi.python.org/pypi/joblib#downloads
+    **Download:**            https://pypi.python.org/pypi/joblib#downloads
 
     **Source code:**         https://github.com/joblib/joblib
 
@@ -106,7 +106,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.13.0'
+__version__ = '0.13.2'
 
 
 from .memory import Memory, MemorizedResult, register_store_backend