Skip to content

Commit e8147fc

Browse files
authored
Merge branch 'master' into n_estimator_should_be_100
2 parents 8d96f42 + 8083ea4 commit e8147fc

File tree

221 files changed

+8553
-14743
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

221 files changed

+8553
-14743
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ benchmarks/bench_covertype_data/
5454
*.prefs
5555
.pydevproject
5656
.idea
57+
.vscode
5758

5859
*.c
5960
*.cpp

.travis.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@ sudo: false
33

44
language: python
55

6-
cache:
7-
apt: true
8-
directories:
9-
- $HOME/.cache/pip
10-
- $HOME/.ccache
6+
# cache:
7+
# apt: true
8+
# directories:
9+
# - $HOME/.cache/pip
10+
# - $HOME/.ccache
1111

1212
dist: trusty
1313

README.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ or ``conda``::
7878
The documentation includes more detailed `installation instructions <http://scikit-learn.org/stable/install.html>`_.
7979

8080

81+
Changelog
82+
---------
83+
84+
See the `changelog <http://scikit-learn.org/dev/whats_new.html>`__
85+
for a history of notable changes to scikit-learn.
86+
8187
Development
8288
-----------
8389

benchmarks/bench_plot_incremental_pca.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from collections import defaultdict
1414
import matplotlib.pyplot as plt
1515
from sklearn.datasets import fetch_lfw_people
16-
from sklearn.decomposition import IncrementalPCA, RandomizedPCA, PCA
16+
from sklearn.decomposition import IncrementalPCA, PCA
1717

1818

1919
def plot_results(X, y, label):
@@ -37,7 +37,6 @@ def plot_feature_times(all_times, batch_size, all_components, data):
3737
plot_results(all_components, all_times['pca'], label="PCA")
3838
plot_results(all_components, all_times['ipca'],
3939
label="IncrementalPCA, bsize=%i" % batch_size)
40-
plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
4140
plt.legend(loc="upper left")
4241
plt.suptitle("Algorithm runtime vs. n_components\n \
4342
LFW, size %i x %i" % data.shape)
@@ -50,7 +49,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data):
5049
plot_results(all_components, all_errors['pca'], label="PCA")
5150
plot_results(all_components, all_errors['ipca'],
5251
label="IncrementalPCA, bsize=%i" % batch_size)
53-
plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
5452
plt.legend(loc="lower left")
5553
plt.suptitle("Algorithm error vs. n_components\n"
5654
"LFW, size %i x %i" % data.shape)
@@ -61,7 +59,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data):
6159
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
6260
plt.figure()
6361
plot_results(all_batch_sizes, all_times['pca'], label="PCA")
64-
plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
6562
plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
6663
plt.legend(loc="lower left")
6764
plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \
@@ -92,11 +89,9 @@ def fixed_batch_size_comparison(data):
9289
all_errors = defaultdict(list)
9390
for n_components in all_features:
9491
pca = PCA(n_components=n_components)
95-
rpca = RandomizedPCA(n_components=n_components, random_state=1999)
9692
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
9793
results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
98-
('ipca', ipca),
99-
('rpca', rpca)]}
94+
('ipca', ipca)]}
10095

10196
for k in sorted(results_dict.keys()):
10297
all_times[k].append(results_dict[k]['time'])
@@ -116,7 +111,8 @@ def variable_batch_size_comparison(data):
116111
all_times = defaultdict(list)
117112
all_errors = defaultdict(list)
118113
pca = PCA(n_components=n_components)
119-
rpca = RandomizedPCA(n_components=n_components, random_state=1999)
114+
rpca = PCA(n_components=n_components, svd_solver='randomized',
115+
random_state=1999)
120116
results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
121117
('rpca', rpca)]}
122118

@@ -138,8 +134,6 @@ def variable_batch_size_comparison(data):
138134
all_errors['ipca'].append(results_dict['ipca']['error'])
139135

140136
plot_batch_times(all_times, n_components, batch_sizes, data)
141-
# RandomizedPCA error is always worse (approx 100x) than other PCA
142-
# tests
143137
plot_batch_errors(all_errors, n_components, batch_sizes, data)
144138

145139
faces = fetch_lfw_people(resize=.2, min_faces_per_person=5)

build_tools/circle/list_versions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def get_pdf_size(version):
4747
return human_readable_data_quantity(path_details['size'], 1000)
4848

4949

50+
print(':orphan:')
51+
print()
5052
heading = 'Available documentation for Scikit-learn'
5153
print(heading)
5254
print('=' * len(heading))

doc/conf.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,6 @@
7070
# The encoding of source files.
7171
#source_encoding = 'utf-8'
7272

73-
# Generate the plots for the gallery
74-
plot_gallery = True
75-
7673
# The master toctree document.
7774
master_doc = 'index'
7875

@@ -102,7 +99,7 @@
10299

103100
# List of patterns, relative to source directory, that match files and
104101
# directories to ignore when looking for source files.
105-
exclude_patterns = ['_build', 'templates', 'includes']
102+
exclude_patterns = ['_build', 'templates', 'includes', 'themes']
106103

107104
# The reST default role (used for this markup: `text`) to use for all
108105
# documents.

doc/conftest.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from sklearn.utils.testing import SkipTest
77
from sklearn.utils.testing import check_skip_network
88
from sklearn.datasets import get_data_home
9+
from sklearn.datasets.base import _pkl_filepath
10+
from sklearn.datasets.twenty_newsgroups import CACHE_NAME
911
from sklearn.utils.testing import install_mldata_mock
1012
from sklearn.utils.testing import uninstall_mldata_mock
1113

@@ -47,26 +49,50 @@ def setup_rcv1():
4749

4850
def setup_twenty_newsgroups():
4951
data_home = get_data_home()
50-
if not exists(join(data_home, '20news_home')):
52+
cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
53+
if not exists(cache_path):
5154
raise SkipTest("Skipping dataset loading doctests")
5255

5356

5457
def setup_working_with_text_data():
5558
check_skip_network()
59+
cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
60+
if not exists(cache_path):
61+
raise SkipTest("Skipping dataset loading doctests")
62+
63+
64+
def setup_compose():
65+
try:
66+
import pandas # noqa
67+
except ImportError:
68+
raise SkipTest("Skipping compose.rst, pandas not installed")
69+
70+
71+
def setup_impute():
72+
try:
73+
import pandas # noqa
74+
except ImportError:
75+
raise SkipTest("Skipping impute.rst, pandas not installed")
5676

5777

5878
def pytest_runtest_setup(item):
5979
fname = item.fspath.strpath
60-
if fname.endswith('datasets/labeled_faces.rst'):
80+
is_index = fname.endswith('datasets/index.rst')
81+
if fname.endswith('datasets/labeled_faces.rst') or is_index:
6182
setup_labeled_faces()
62-
elif fname.endswith('datasets/mldata.rst'):
83+
elif fname.endswith('datasets/mldata.rst') or is_index:
6384
setup_mldata()
64-
elif fname.endswith('datasets/rcv1.rst'):
85+
elif fname.endswith('datasets/rcv1.rst') or is_index:
6586
setup_rcv1()
66-
elif fname.endswith('datasets/twenty_newsgroups.rst'):
87+
elif fname.endswith('datasets/twenty_newsgroups.rst') or is_index:
6788
setup_twenty_newsgroups()
68-
elif fname.endswith('tutorial/text_analytics/working_with_text_data.rst'):
89+
elif fname.endswith('tutorial/text_analytics/working_with_text_data.rst')\
90+
or is_index:
6991
setup_working_with_text_data()
92+
elif fname.endswith('modules/compose.rst') or is_index:
93+
setup_compose()
94+
elif fname.endswith('modules/impute.rst'):
95+
setup_impute()
7096

7197

7298
def pytest_runtest_teardown(item):

doc/datasets/covtype.rst

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
21
.. _covtype:
32

43
Forest covertypes
5-
=================
4+
-----------------
65

76
The samples in this dataset correspond to 30×30m patches of forest in the US,
87
collected for the task of predicting each patch's cover type,
98
i.e. the dominant species of tree.
109
There are seven covertypes, making this a multiclass classification problem.
1110
Each sample has 54 features, described on the
12-
`dataset's homepage <http://archive.ics.uci.edu/ml/datasets/Covertype>`_.
11+
`dataset's homepage <http://archive.ics.uci.edu/ml/datasets/Covertype>`__.
1312
Some of the features are boolean indicators,
1413
while others are discrete or continuous measurements.
1514

0 commit comments

Comments
 (0)