Skip to content

Commit 75a3aee

Browse files
authored
Merge branch 'master' into optim_tsne
2 parents 7854639 + ee88cf4 commit 75a3aee

File tree

134 files changed

+1638
-1223
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

134 files changed

+1638
-1223
lines changed

.travis.yml

+34-36
Original file line numberDiff line numberDiff line change
@@ -3,62 +3,60 @@ sudo: false
33

44
language: python
55

6-
# Pre-install packages for the ubuntu distribution
76
cache:
87
apt: true
98
directories:
109
- $HOME/.cache/pip
11-
addons:
12-
apt:
13-
packages:
14-
# these only required by the DISTRIB="ubuntu" builds:
15-
- python-scipy
16-
- libatlas3gf-base
17-
- libatlas-dev
10+
1811
dist: trusty
12+
1913
env:
2014
global:
2115
# Directory where tests are run from
2216
- TEST_DIR=/tmp/sklearn
2317
- OMP_NUM_THREADS=4
2418
- OPENBLAS_NUM_THREADS=4
25-
matrix:
19+
20+
matrix:
21+
include:
2622
# This environment tests that scikit-learn can be built against
2723
# versions of numpy, scipy with ATLAS that comes with Ubuntu Trusty 14.04
28-
- DISTRIB="ubuntu" PYTHON_VERSION="2.7" CYTHON_VERSION="0.23.4"
29-
COVERAGE=true
24+
- env: DISTRIB="ubuntu" PYTHON_VERSION="2.7" CYTHON_VERSION="0.23.4"
25+
COVERAGE=true
26+
addons:
27+
apt:
28+
packages:
29+
# these only required by the DISTRIB="ubuntu" builds:
30+
- python-scipy
31+
- libatlas3gf-base
32+
- libatlas-dev
3033
# This environment tests the oldest supported anaconda env
31-
- DISTRIB="conda" PYTHON_VERSION="2.7" INSTALL_MKL="false"
32-
NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.13.3" CYTHON_VERSION="0.23.4"
33-
COVERAGE=true
34-
# This environment tests the newest supported Anaconda release (4.3.1)
34+
- env: DISTRIB="conda" PYTHON_VERSION="2.7" INSTALL_MKL="false"
35+
NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.13.3" CYTHON_VERSION="0.23.5"
36+
COVERAGE=true
37+
# This environment tests the newest supported Anaconda release (4.4.0)
3538
# It also runs tests requiring Pandas.
36-
- DISTRIB="conda" PYTHON_VERSION="3.6" INSTALL_MKL="true"
37-
NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" PANDAS_VERSION="0.19.2"
38-
CYTHON_VERSION="0.25.2" COVERAGE=true
39+
- env: DISTRIB="conda" PYTHON_VERSION="3.6.1" INSTALL_MKL="true"
40+
NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" PANDAS_VERSION="0.20.1"
41+
CYTHON_VERSION="0.25.2" COVERAGE=true
3942
# This environment uses pytest to run the tests. It uses the newest
40-
# supported Anaconda release (4.3.1). It also runs tests requiring Pandas.
41-
- USE_PYTEST="true" DISTRIB="conda" PYTHON_VERSION="3.6" INSTALL_MKL="true"
42-
NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" PANDAS_VERSION="0.19.2"
43-
CYTHON_VERSION="0.25.2"
43+
# supported Anaconda release (4.4.0). It also runs tests requiring Pandas.
44+
# - env: USE_PYTEST="true" DISTRIB="conda" PYTHON_VERSION="3.6.1"
45+
# INSTALL_MKL="true" NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0"
46+
# PANDAS_VERSION="0.20.1" CYTHON_VERSION="0.25.2"
4447
# flake8 linting on diff wrt common ancestor with upstream/master
45-
- RUN_FLAKE8="true" SKIP_TESTS="true"
46-
DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true"
47-
NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4"
48-
49-
50-
matrix:
51-
allow_failures:
52-
# allow_failures seems to be keyed on the python version
53-
# We are using this to allow failures for DISTRIB=scipy-dev-wheels
54-
- python: 3.5
55-
56-
include:
48+
- env: RUN_FLAKE8="true" SKIP_TESTS="true"
49+
DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true"
50+
NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.23.5"
5751
# This environment tests scikit-learn against numpy and scipy master
5852
# installed from their CI wheels in a virtualenv with the Python
5953
# interpreter provided by travis.
60-
- python: 3.5
61-
env: DISTRIB="scipy-dev-wheels"
54+
# - python: 3.5
55+
# env: DISTRIB="scipy-dev-wheels"
56+
# allow_failures:
57+
# # allow_failures seems to be keyed on the python version
58+
# # We are using this to allow failures for DISTRIB=scipy-dev-wheels
59+
# - python: 3.5
6260

6361
install: source build_tools/travis/install.sh
6462
script: bash build_tools/travis/test_script.sh

benchmarks/bench_plot_nmf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
2525
from sklearn.externals.joblib import Memory
2626
from sklearn.exceptions import ConvergenceWarning
27-
from sklearn.utils.extmath import fast_dot, safe_sparse_dot, squared_norm
27+
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
2828
from sklearn.utils import check_array
2929
from sklearn.utils.validation import check_is_fitted, check_non_negative
3030

@@ -99,7 +99,7 @@ def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0.,
9999
http://www.csie.ntu.edu.tw/~cjlin/nmf/
100100
"""
101101
WtX = safe_sparse_dot(W.T, X)
102-
WtW = fast_dot(W.T, W)
102+
WtW = np.dot(W.T, W)
103103

104104
# values justified in the paper (alpha is renamed gamma)
105105
gamma = 1

doc/conf.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -236,13 +236,12 @@
236236

237237
sphinx_gallery_conf = {
238238
'doc_module': 'sklearn',
239+
'backreferences_dir': os.path.join('modules', 'generated'),
239240
'reference_url': {
240241
'sklearn': None,
241242
'matplotlib': 'http://matplotlib.org',
242243
'numpy': 'http://docs.scipy.org/doc/numpy-1.8.1',
243-
'scipy': 'http://docs.scipy.org/doc/scipy-0.13.3/reference'},
244-
'expected_failing_examples': [
245-
'../examples/applications/plot_stock_market.py']
244+
'scipy': 'http://docs.scipy.org/doc/scipy-0.13.3/reference'}
246245
}
247246

248247

doc/developers/performance.rst

-33
Original file line numberDiff line numberDiff line change
@@ -84,38 +84,6 @@ C/C++ generated files are embedded in distributed stable packages. The goal is
8484
to make it possible to install scikit-learn stable version
8585
on any machine with Python, Numpy, Scipy and C/C++ compiler.
8686

87-
Fast matrix multiplications
88-
===========================
89-
90-
Matrix multiplications (matrix-matrix and matrix-vector) are usually handled
91-
using the NumPy function ``np.dot``, but in versions of NumPy before 1.7.2
92-
this function is suboptimal when the inputs are not both in the C (row-major)
93-
layout; in that case, the inputs may be implicitly copied to obtain the right
94-
layout. This obviously consumes memory and takes time.
95-
96-
The function ``fast_dot`` in ``sklearn.utils.extmath`` offers a fast
97-
replacement for ``np.dot`` that prevents copies from being made in some cases.
98-
In all other cases, it dispatches to ``np.dot`` and when the NumPy version is
99-
new enough, it is in fact an alias for that function, making it a drop-in
100-
replacement. Example usage of ``fast_dot``::
101-
102-
>>> import numpy as np
103-
>>> from sklearn.utils.extmath import fast_dot
104-
>>> X = np.random.random_sample([2, 10])
105-
>>> np.allclose(np.dot(X, X.T), fast_dot(X, X.T))
106-
True
107-
108-
This function operates optimally on 2-dimensional arrays, both of the same
109-
dtype, which should be either single or double precision float. If these
110-
requirements aren't met or the BLAS package is not available, the call is
111-
silently dispatched to ``numpy.dot``. If you want to be sure when the original
112-
``numpy.dot`` has been invoked in a situation where it is suboptimal, you can
113-
activate the related warning::
114-
115-
>>> import warnings
116-
>>> from sklearn.exceptions import NonBLASDotWarning
117-
>>> warnings.simplefilter('always', NonBLASDotWarning) # doctest: +SKIP
118-
11987
.. _profiling-python-code:
12088

12189
Profiling Python code
@@ -425,4 +393,3 @@ A sample algorithmic trick: warm restarts for cross validation
425393

426394
TODO: demonstrate the warm restart tricks for cross validation of linear
427395
regression with Coordinate Descent.
428-

doc/faq.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ How can I contribute to scikit-learn?
2525
-----------------------------------------
2626
See :ref:`contributing`. Before wanting to add a new algorithm, which is
2727
usually a major and lengthy undertaking, it is recommended to start with :ref:`known
28-
issues <easy_issues>`.
28+
issues <easy_issues>`. Please do not contact the contributors of scikit-learn directly
29+
regarding contributing to scikit-learn.
2930

3031
What's the best way to get help on scikit-learn usage?
3132
--------------------------------------------------------------

doc/modules/classes.rst

+3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ Functions
4040
:template: function.rst
4141

4242
base.clone
43+
config_context
44+
set_config
45+
get_config
4346

4447

4548
.. _cluster_ref:

doc/modules/computational_performance.rst

+19
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,25 @@ To benchmark different estimators for your case you can simply change the
6868
:ref:`sphx_glr_auto_examples_applications_plot_prediction_latency.py`. This should give
6969
you an estimate of the order of magnitude of the prediction latency.
7070

71+
.. topic:: Configuring Scikit-learn for reduced validation overhead
72+
73+
Scikit-learn does some validation on data that increases the overhead per
74+
call to ``predict`` and similar functions. In particular, checking that
75+
features are finite (not NaN or infinite) involves a full pass over the
76+
data. If you ensure that your data is acceptable, you may suppress
77+
checking for finiteness by setting the environment variable
78+
``SKLEARN_ASSUME_FINITE`` to a non-empty string before importing
79+
scikit-learn, or configure it in Python with :func:`sklearn.set_config`.
80+
For more control than these global settings, a :func:`config_context`
81+
allows you to set this configuration within a specified context::
82+
83+
>>> import sklearn
84+
>>> with sklearn.config_context(assume_finite=True):
85+
... pass # do learning/prediction here with reduced validation
86+
87+
Note that this will affect all uses of
88+
:func:`sklearn.utils.assert_all_finite` within the context.
89+
7190
Influence of the Number of Features
7291
-----------------------------------
7392

doc/modules/cross_validation.rst

+15-15
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ In this case we would like to know if a model trained on a particular set of
464464
groups generalizes well to the unseen groups. To measure this, we need to
465465
ensure that all the samples in the validation fold come from groups that are
466466
not represented at all in the paired training fold.
467-
467+
468468
The following cross-validation splitters can be used to do that.
469469
The grouping identifier for the samples is specified via the ``groups``
470470
parameter.
@@ -601,29 +601,29 @@ samples that are part of the validation set, and to -1 for all other samples.
601601
Cross validation of time series data
602602
====================================
603603

604-
Time series data is characterised by the correlation between observations
605-
that are near in time (*autocorrelation*). However, classical
606-
cross-validation techniques such as :class:`KFold` and
607-
:class:`ShuffleSplit` assume the samples are independent and
608-
identically distributed, and would result in unreasonable correlation
609-
between training and testing instances (yielding poor estimates of
610-
generalisation error) on time series data. Therefore, it is very important
611-
to evaluate our model for time series data on the "future" observations
612-
least like those that are used to train the model. To achieve this, one
604+
Time series data is characterised by the correlation between observations
605+
that are near in time (*autocorrelation*). However, classical
606+
cross-validation techniques such as :class:`KFold` and
607+
:class:`ShuffleSplit` assume the samples are independent and
608+
identically distributed, and would result in unreasonable correlation
609+
between training and testing instances (yielding poor estimates of
610+
generalisation error) on time series data. Therefore, it is very important
611+
to evaluate our model for time series data on the "future" observations
612+
least like those that are used to train the model. To achieve this, one
613613
solution is provided by :class:`TimeSeriesSplit`.
614614

615615

616616
Time Series Split
617617
-----------------
618618

619-
:class:`TimeSeriesSplit` is a variation of *k-fold* which
620-
returns first :math:`k` folds as train set and the :math:`(k+1)` th
621-
fold as test set. Note that unlike standard cross-validation methods,
619+
:class:`TimeSeriesSplit` is a variation of *k-fold* which
620+
returns the first :math:`k` folds as the train set and the :math:`(k+1)` th
621+
fold as test set. Note that unlike standard cross-validation methods,
622622
successive training sets are supersets of those that come before them.
623623
Also, it adds all surplus data to the first training partition, which
624624
is always used to train the model.
625625

626-
This class can be used to cross-validate time series data samples
626+
This class can be used to cross-validate time series data samples
627627
that are observed at fixed time intervals.
628628

629629
Example of 3-split time series cross-validation on a dataset with 6 samples::
@@ -634,7 +634,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples::
634634
>>> y = np.array([1, 2, 3, 4, 5, 6])
635635
>>> tscv = TimeSeriesSplit(n_splits=3)
636636
>>> print(tscv) # doctest: +NORMALIZE_WHITESPACE
637-
TimeSeriesSplit(n_splits=3)
637+
TimeSeriesSplit(max_train_size=None, n_splits=3)
638638
>>> for train, test in tscv.split(X):
639639
... print("%s %s" % (train, test))
640640
[0 1 2] [3]

doc/modules/grid_search.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ estimator classes. Typical examples include ``C``, ``kernel`` and ``gamma``
1414
for Support Vector Classifier, ``alpha`` for Lasso, etc.
1515

1616
It is possible and recommended to search the hyper-parameter space for the
17-
best :ref:`cross_validation` score.
17+
best :ref:`cross validation <cross_validation>` score.
1818

1919
Any parameter provided when constructing an estimator may be optimized in this
2020
manner. Specifically, to find the names and current values for all parameters

doc/modules/model_evaluation.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ Here is an example of building custom scorers, and of using the
173173
>>> # and predictions defined below.
174174
>>> loss = make_scorer(my_custom_loss_func, greater_is_better=False)
175175
>>> score = make_scorer(my_custom_loss_func, greater_is_better=True)
176-
>>> ground_truth = [[1, 1]]
176+
>>> ground_truth = [[1], [1]]
177177
>>> predictions = [0, 1]
178178
>>> from sklearn.dummy import DummyClassifier
179179
>>> clf = DummyClassifier(strategy='most_frequent', random_state=0)

doc/sphinxext/sphinx_gallery/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
"""
2-
==============
32
Sphinx Gallery
43
==============
54
65
"""
76
import os
8-
__version__ = '0.1.7'
7+
__version__ = '0.1.11'
98

109

1110
def glr_path_static():

doc/sphinxext/sphinx_gallery/_static/gallery.css

-22
Original file line numberDiff line numberDiff line change
@@ -190,25 +190,3 @@ p.sphx-glr-signature a.reference.external {
190190
margin-left: auto;
191191
display: table;
192192
}
193-
194-
a.sphx-glr-code-links:hover{
195-
text-decoration: none;
196-
}
197-
198-
a.sphx-glr-code-links[tooltip]:hover:before{
199-
background: rgba(0,0,0,.8);
200-
border-radius: 5px;
201-
color: white;
202-
content: attr(tooltip);
203-
padding: 5px 15px;
204-
position: absolute;
205-
z-index: 98;
206-
width: 16em;
207-
word-break: normal;
208-
white-space: normal;
209-
display: inline-block;
210-
text-align: center;
211-
text-indent: 0;
212-
margin-left: 0; /* Use zero to avoid overlapping with sidebar */
213-
margin-top: 1.2em;
214-
}

0 commit comments

Comments
 (0)