diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py
index 6f3c607806549..0e6aedf0c6925 100644
--- a/sklearn/cluster/affinity_propagation_.py
+++ b/sklearn/cluster/affinity_propagation_.py
@@ -298,7 +298,7 @@ def fit(self, X, y=None):
             affinity_propagation(
                 self.affinity_matrix_, self.preference, max_iter=self.max_iter,
                 convergence_iter=self.convergence_iter, damping=self.damping,
-                copy=self.copy, verbose=self.verbose, return_n_iter=True)
+                copy=False, verbose=self.verbose, return_n_iter=True)
 
         if self.affinity != "precomputed":
             self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py
index 98e2d7a69eb39..64dcb54f96bd9 100644
--- a/sklearn/cluster/birch.py
+++ b/sklearn/cluster/birch.py
@@ -425,7 +425,7 @@ def fit(self, X, y=None):
         return self._fit(X)
 
     def _fit(self, X):
-        X = check_array(X, accept_sparse='csr', copy=self.copy)
+        X = check_array(X, accept_sparse='csr', copy=False)
         threshold = self.threshold
         branching_factor = self.branching_factor
diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index a24078e439b36..58ab520a78a9b 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -236,8 +236,10 @@ def fit(self, X, Y):
         # copy since this will contains the residuals (deflated) matrices
         check_consistent_length(X, Y)
-        X = check_array(X, dtype=np.float64, copy=self.copy)
-        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
+        Xcopy = not X.flags.writeable or self.copy
+        Ycopy = not Y.flags.writeable or self.copy
+        X = check_array(X, dtype=np.float64, copy=Xcopy)
+        Y = check_array(Y, dtype=np.float64, copy=Ycopy, ensure_2d=False)
         if Y.ndim == 1:
             Y = Y.reshape(-1, 1)
@@ -724,8 +726,10 @@ def __init__(self, n_components=2, scale=True, copy=True):
     def fit(self, X, Y):
         # copy since this will contains the centered data
         check_consistent_length(X, Y)
-        X = check_array(X, dtype=np.float64, copy=self.copy)
-        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
+        Xcopy = not X.flags.writeable or self.copy
+        Ycopy = not Y.flags.writeable or self.copy
+        X = check_array(X, dtype=np.float64, copy=Xcopy)
+        Y = check_array(Y, dtype=np.float64, copy=Ycopy, ensure_2d=False)
         if Y.ndim == 1:
             Y = Y.reshape(-1, 1)
diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py
index c247538951a50..33f43e77937f3 100644
--- a/sklearn/decomposition/factor_analysis.py
+++ b/sklearn/decomposition/factor_analysis.py
@@ -151,7 +151,8 @@ def fit(self, X, y=None):
         -------
         self
         """
-        X = check_array(X, copy=self.copy, dtype=np.float64)
+        copy = not X.flags.writeable or self.copy
+        X = check_array(X, copy=copy, dtype=np.float64)
 
         n_samples, n_features = X.shape
         n_components = self.n_components
@@ -247,14 +248,15 @@ def transform(self, X):
         """
         check_is_fitted(self, 'components_')
 
-        X = check_array(X)
+        copy = not X.flags.writeable or self.copy
+        X = check_array(X, copy=copy)
         Ih = np.eye(len(self.components_))
 
-        X_transformed = X - self.mean_
+        X -= self.mean_
 
         Wpsi = self.components_ / self.noise_variance_
         cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T))
-        tmp = fast_dot(X_transformed, Wpsi.T)
+        tmp = fast_dot(X, Wpsi.T)
         X_transformed = fast_dot(tmp, cov_z)
 
         return X_transformed
diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index d95c465d2fc4a..1f711382b2ec8 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -168,7 +168,9 @@ def fit(self, X, y=None):
         self.explained_variance_ratio_ = None
         self.noise_variance_ = None
 
-        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
+        # copy if needed, because fit performs in-place modifications
+        copy = not X.flags.writeable or self.copy
+        X = check_array(X, copy=copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
 
         if self.batch_size is None:
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index bf2aaf47b13bc..3319de5dfcf2b 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -305,7 +305,7 @@ def partial_fit(self, X, y=None):
             raise TypeError("MinMaxScaler does no support sparse input. "
                             "You may consider to use MaxAbsScaler instead.")
 
-        X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
+        X = check_array(X, copy=False, ensure_2d=False, warn_on_dtype=True,
                         estimator=self, dtype=FLOAT_DTYPES)
 
         if X.ndim == 1:
@@ -332,7 +332,7 @@ def partial_fit(self, X, y=None):
         self.data_range_ = data_range
         return self
 
-    def transform(self, X):
+    def transform(self, X, copy=None):
         """Scaling features of X according to feature_range.
 
         Parameters
@@ -341,8 +341,8 @@ def transform(self, X):
             Input data that will be transformed.
         """
         check_is_fitted(self, 'scale_')
-
-        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
+        copy = copy if copy is not None else self.copy
+        X = check_array(X, copy=copy, ensure_2d=False, dtype=FLOAT_DTYPES)
 
         if X.ndim == 1:
             warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
@@ -350,7 +350,7 @@ def transform(self, X):
         X += self.min_
         return X
 
-    def inverse_transform(self, X):
+    def inverse_transform(self, X, copy=None):
         """Undo the scaling of X according to feature_range.
 
         Parameters
@@ -359,8 +359,8 @@ def inverse_transform(self, X):
             Input data that will be transformed. It cannot be sparse.
         """
         check_is_fitted(self, 'scale_')
-
-        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
+        copy = copy if copy is not None else self.copy
+        X = check_array(X, copy=copy, ensure_2d=False, dtype=FLOAT_DTYPES)
 
         if X.ndim == 1:
             warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
@@ -557,7 +557,7 @@ def partial_fit(self, X, y=None):
 
         y: Passthrough for ``Pipeline`` compatibility.
         """
-        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=False,
                         ensure_2d=False, warn_on_dtype=True, estimator=self,
                         dtype=FLOAT_DTYPES)
 
@@ -748,7 +748,7 @@ def partial_fit(self, X, y=None):
 
         y: Passthrough for ``Pipeline`` compatibility.
         """
-        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=False,
                         ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
 
         if X.ndim == 1:
@@ -772,7 +772,7 @@ def partial_fit(self, X, y=None):
         self.scale_ = _handle_zeros_in_scale(max_abs)
         return self
 
-    def transform(self, X, y=None):
+    def transform(self, X, y=None, copy=None):
         """Scale the data
 
         Parameters
@@ -781,7 +781,8 @@ def transform(self, X, y=None):
             The data that should be scaled.
""" check_is_fitted(self, 'scale_') - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, + copy = copy if copy is not None else self.copy + X = check_array(X, accept_sparse=('csr', 'csc'), copy=copy, ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES) if X.ndim == 1: @@ -796,7 +797,7 @@ def transform(self, X, y=None): X /= self.scale_ return X - def inverse_transform(self, X): + def inverse_transform(self, X, copy=None): """Scale back the data to the original representation Parameters @@ -805,6 +806,7 @@ def inverse_transform(self, X): The data that should be transformed back. """ check_is_fitted(self, 'scale_') + copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES) if X.ndim == 1: @@ -934,9 +936,9 @@ def __init__(self, with_centering=True, with_scaling=True, copy=True): self.with_scaling = with_scaling self.copy = copy - def _check_array(self, X, copy): + def _check_array(self, X, copy=False): """Makes sure centering is not enabled for sparse matrices.""" - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, + X = check_array(X, accept_sparse=('csr', 'csc'), copy=copy, ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES) if X.ndim == 1: @@ -960,7 +962,7 @@ def fit(self, X, y=None): """ if sparse.issparse(X): raise TypeError("RobustScaler cannot be fitted on sparse inputs") - X = self._check_array(X, self.copy) + X = self._check_array(X, copy=False) if X.ndim == 1: warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning) if self.with_centering: @@ -972,7 +974,7 @@ def fit(self, X, y=None): self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) return self - def transform(self, X, y=None): + def transform(self, X, y=None, copy=None): """Center and scale the data Parameters @@ -984,7 +986,8 @@ def transform(self, X, y=None): check_is_fitted(self, 'center_') if self.with_scaling: check_is_fitted(self, 'scale_') - X = self._check_array(X, self.copy) + copy = copy if copy is not None else self.copy + X = self._check_array(X, copy) if X.ndim == 1: warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning) @@ -1001,7 +1004,7 @@ def transform(self, X, y=None): X /= self.scale_ return X - def inverse_transform(self, X): + def inverse_transform(self, X, copy=None): """Scale back the data to the original representation Parameters @@ -1013,7 +1016,8 @@ def inverse_transform(self, X): check_is_fitted(self, 'center_') if self.with_scaling: check_is_fitted(self, 'scale_') - X = self._check_array(X, self.copy) + copy = copy if copy is not None else self.copy + X = self._check_array(X, copy) if X.ndim == 1: warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning) @@ -1449,7 +1453,7 @@ def fit(self, X, y=None): This method is just there to implement the usual API and hence work in pipelines. """ - check_array(X, accept_sparse='csr') + check_array(X, accept_sparse='csr', copy=False) return self def transform(self, X, y=None, copy=None): diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 0ef23c471bd60..014420aad2a58 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -299,7 +299,7 @@ def _dense_fit(self, X, strategy, missing_values, axis): return most_frequent - def transform(self, X): + def transform(self, X, copy=None): """Impute all missing values in X. 
 
         Parameters
@@ -311,7 +311,8 @@ def transform(self, X):
         check_is_fitted(self, 'statistics_')
 
         # Copy just once
-        X = as_float_array(X, copy=self.copy, force_all_finite=False)
+        copy = copy if copy is not None else self.copy
+        X = as_float_array(X, copy=copy, force_all_finite=False)
 
         # Since two different arrays can be provided in fit(X) and
         # transform(X), the imputation data need to be recomputed
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 5a28b31b33c2f..46a6c8c119f8e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -26,6 +26,7 @@
 from sklearn.linear_model.base import LinearClassifierMixin
 from sklearn.utils.estimator_checks import (
     _yield_all_checks,
+    _clear_temp_memory,
     CROSS_DECOMPOSITION,
     check_parameters_default_constructible,
     check_class_weight_balanced_linear_classifier,
@@ -73,6 +74,8 @@ def test_non_meta_estimators():
                 yield check, name, Estimator
             else:
                 yield check, name, Estimator
+    _clear_temp_memory(warn=False)
+
 
 def test_configure():
     # Smoke test the 'configure' step of setup, this tests all the
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 4e396db755d39..63aa3beb04b74 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -1,10 +1,22 @@
 from __future__ import print_function
 
+import os
 import types
 import warnings
 import sys
 import traceback
 import pickle
+import tempfile
+import shutil
+
+from nose.tools import assert_false
+from sklearn.externals.joblib import Memory
+
+# WindowsError only exists on Windows
+try:
+    WindowsError
+except NameError:
+    WindowsError = OSError
+
 from copy import deepcopy
 
 import numpy as np
@@ -52,6 +63,7 @@
 
 BOSTON = None
+_TEMP_READONLY_MEMMAP_MEMORY = None
 CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
 MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet',
                 'ExtraTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcess',
@@ -79,6 +91,7 @@ def _yield_non_meta_checks(name, Estimator):
     yield check_fit_score_takes_y
     yield check_dtype_object
     yield check_estimators_fit_returns_self
+    yield check_estimators_fit_returns_self_readonly
 
     # Check that all estimator yield informative messages when
     # trained on empty datasets
@@ -118,6 +131,7 @@ def _yield_classifier_checks(name, Classifier):
     # basic consistency testing
     yield check_classifiers_train
     yield check_classifiers_regression_target
+    yield check_classifiers_train_readonly
     if (name not in ["MultinomialNB", "LabelPropagation", "LabelSpreading"]
             # TODO some complication with -1 label
             and name not in ["DecisionTreeClassifier",
@@ -137,6 +151,7 @@ def _yield_regressor_checks(name, Regressor):
     # TODO: test with multiple responses
     # basic testing
     yield check_regressors_train
+    yield check_regressors_train_readonly
     yield check_regressor_data_not_an_array
     yield check_estimators_partial_fit_n_features
     yield check_regressors_no_decision_function
@@ -160,6 +175,7 @@ def _yield_transformer_checks(name, Transformer):
                     'FunctionTransformer', 'Normalizer']:
         # basic tests
         yield check_transformer_general
+        yield check_transformer_general_readonly
         yield check_transformers_unfitted
 
@@ -169,6 +185,7 @@ def _yield_clustering_checks(name, Clusterer):
         # this is clustering on the features
         # let's not test that here.
         yield check_clustering
+        yield check_clustering_readonly
     yield check_estimators_partial_fit_n_features
 
 
@@ -216,16 +233,99 @@ def check_estimator(Estimator):
         check(name, Estimator)
 
 
-def _boston_subset(n_samples=200):
+def _boston_subset(n_samples=200, scale_y=False, convert_y_2d=False):
+    """Utility function used to cache the Boston subset in a global variable"""
     global BOSTON
     if BOSTON is None:
         boston = load_boston()
         X, y = boston.data, boston.target
         X, y = shuffle(X, y, random_state=0)
-        X, y = X[:n_samples], y[:n_samples]
         X = StandardScaler().fit_transform(X)
         BOSTON = X, y
-    return BOSTON
+    else:
+        X, y = BOSTON
+    X, y = X[:n_samples], y[:n_samples]
+    if scale_y:
+        y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()
+    if convert_y_2d:
+        y = y[:, np.newaxis]
+    return X, y
+
+
+def _readonly_boston_subset(*args, **kwargs):
+    """Utility function used to return a read-only memmap, without
+    recreating a new memory map at each call"""
+    _init_temp_memory()
+    f = _TEMP_READONLY_MEMMAP_MEMORY.cache(_boston_subset)
+    return f(*args, **kwargs)
+
+
+def _boston_subset_with_mode(*args, **kwargs):
+    """Factorisation function used in checks"""
+    readonly = kwargs.pop('readonly', None)
+    if readonly:
+        return _readonly_boston_subset(*args, **kwargs)
+    else:
+        return _boston_subset(*args, **kwargs)
+
+
+def _make_blobs(*args, **kwargs):
+    """make_blobs wrapper with optional scaling, shuffling and shifting
+    of X to positive values only"""
+    positive = kwargs.pop('positive', False)
+    scale = kwargs.pop('scale', False)
+    shuffle_flag = kwargs.pop('shuffle', False)
+    X, y = make_blobs(*args, **kwargs)
+    if scale:
+        X = StandardScaler().fit_transform(X)
+    if positive:
+        X -= X.min()
+    if shuffle_flag:
+        X, y = shuffle(X, y, random_state=7)
+    return X, y
+
+
+def _readonly_make_blobs(*args, **kwargs):
+    """Utility function used to return a read-only memmap, without
+    recreating a new memory map at each call"""
+    _init_temp_memory()
+    f = _TEMP_READONLY_MEMMAP_MEMORY.cache(_make_blobs)
+    return f(*args, **kwargs)
+
+
+def _make_blobs_with_mode(*args, **kwargs):
+    """Factorisation function used in checks"""
+    readonly = kwargs.pop('readonly', None)
+    if readonly:
+        return _readonly_make_blobs(*args, **kwargs)
+    else:
+        return _make_blobs(*args, **kwargs)
+
+
+def _init_temp_memory(mmap_mode='r'):
+    """Utility function used to initialize a temp folder"""
+    global _TEMP_READONLY_MEMMAP_MEMORY
+    if _TEMP_READONLY_MEMMAP_MEMORY is None:
+        temp_folder = tempfile.mkdtemp(prefix='sklearn_checks_temp_')
+        _TEMP_READONLY_MEMMAP_MEMORY = Memory(cachedir=temp_folder,
+                                              mmap_mode=mmap_mode, verbose=0)
+        # Cannot use atexit as it is called every time a test ends,
+        # thus forcing us to regenerate the cache at every check
+        # atexit.register(_clear_temp_memory(warn=True))
+
+
+def _clear_temp_memory(warn=False):
+    """Utility function used to delete the local temp folder"""
+    global _TEMP_READONLY_MEMMAP_MEMORY
+    if _TEMP_READONLY_MEMMAP_MEMORY is not None:
+        # Recovering temp_folder
+        cachedir = os.path.dirname(_TEMP_READONLY_MEMMAP_MEMORY.cachedir)
+        _TEMP_READONLY_MEMMAP_MEMORY = None
+        try:
+            shutil.rmtree(cachedir)
+        except WindowsError:
+            if warn:
+                warnings.warn("Could not delete temporary folder %s" % cachedir)
 
 
 def set_testing_parameters(estimator):
@@ -286,7 +386,7 @@ def set_testing_parameters(estimator):
 
 
 class NotAnArray(object):
-    " An object that is convertable to an array"
+    """An object that is convertible to an array"""
 
     def __init__(self, data):
         self.data = data
@@ -490,10 +590,23 @@ def check_transformer_general(name, Transformer):
                       random_state=0, n_features=2, cluster_std=0.1)
     X = StandardScaler().fit_transform(X)
     X -= X.min()
+
+
+def check_transformer_general(name, Transformer, readonly=False):
+    X, y = _make_blobs_with_mode(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
+                                 random_state=0, n_features=2, cluster_std=0.1,
+                                 readonly=readonly, positive=True, scale=True)
+    if readonly:
+        assert_false(X.flags['WRITEABLE'])
     _check_transformer(name, Transformer, X, y)
     _check_transformer(name, Transformer, X.tolist(), y.tolist())
 
 
+def check_transformer_general_readonly(name, Transformer):
+    check_transformer_general(name, Transformer, readonly=True)
+
+
 def check_transformer_data_not_an_array(name, Transformer):
     X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
@@ -508,7 +621,7 @@ def check_transformer_data_not_an_array(name, Transformer):
 
 def check_transformers_unfitted(name, Transformer):
     X, y = _boston_subset()
-
+
     with warnings.catch_warnings(record=True):
         transformer = Transformer()
@@ -822,10 +935,10 @@ def check_estimators_partial_fit_n_features(name, Alg):
     assert_raises(ValueError, alg.partial_fit, X[:, :-1], y)
 
 
-def check_clustering(name, Alg):
-    X, y = make_blobs(n_samples=50, random_state=1)
-    X, y = shuffle(X, y, random_state=7)
-    X = StandardScaler().fit_transform(X)
+def check_clustering(name, Alg, readonly=False):
+    X, y = _make_blobs_with_mode(n_samples=50, random_state=1,
+                                 scale=True,
+                                 readonly=readonly, shuffle=True)
     n_samples, n_features = X.shape
     # catch deprecation and neighbors warnings
     with warnings.catch_warnings(record=True):
@@ -839,6 +952,8 @@ def check_clustering(name, Alg):
         alg.set_params(max_iter=100)
 
     # fit
+    if readonly:
+        assert_false(X.flags['WRITEABLE'])
     alg.fit(X)
     # with lists
     alg.fit(X.tolist())
@@ -856,6 +971,10 @@ def check_clustering(name, Alg):
     assert_array_equal(pred, pred2)
 
 
+def check_clustering_readonly(name, Alg):
+    check_clustering(name, Alg, readonly=True)
+
+
 def check_clusterer_compute_labels_predict(name, Clusterer):
     """Check that predict is invariant of compute_labels"""
     X, y = make_blobs(n_samples=20, random_state=0)
@@ -907,13 +1026,20 @@ def check_classifiers_one_label(name, Classifier):
 
 
 @ignore_warnings  # Warnings are raised by decision function
-def check_classifiers_train(name, Classifier):
-    X_m, y_m = make_blobs(n_samples=300, random_state=0)
-    X_m, y_m = shuffle(X_m, y_m, random_state=7)
-    X_m = StandardScaler().fit_transform(X_m)
-    # generate binary problem from multi-class one
-    y_b = y_m[y_m != 2]
-    X_b = X_m[y_m != 2]
+def check_classifiers_train(name, Classifier, readonly=False):
+    if name in ['BernoulliNB', 'MultinomialNB']:
+        positive = True
+    else:
+        positive = False
+    X_m, y_m = _make_blobs_with_mode(n_samples=300,
+                                     random_state=0, shuffle=True,
+                                     readonly=readonly, scale=True,
+                                     positive=positive)
+    # generate binary problem
+    X_b, y_b = _make_blobs_with_mode(n_samples=300,
+                                     random_state=0, shuffle=True,
+                                     readonly=readonly, scale=True,
+                                     positive=positive, centers=2)
     for (X, y) in [(X_m, y_m), (X_b, y_b)]:
         # catch deprecation warnings
         classes = np.unique(y)
@@ -921,14 +1047,15 @@ def check_classifiers_train(name, Classifier):
         n_samples, n_features = X.shape
         with warnings.catch_warnings(record=True):
             classifier = Classifier()
-        if name in ['BernoulliNB', 'MultinomialNB']:
-            X -= X.min()
         set_testing_parameters(classifier)
         set_random_state(classifier)
         # raises error on malformed input for fit
         assert_raises(ValueError, classifier.fit, X, y[:-1])
 
         # fit
+        if readonly:
+            assert_false(X.flags['WRITEABLE'])
+            assert_false(y.flags['WRITEABLE'])
         classifier.fit(X, y)
         # with lists
         classifier.fit(X.tolist(), y.tolist())
@@ -977,22 +1104,33 @@ def check_classifiers_train(name, Classifier):
             assert_raises(ValueError, classifier.predict_proba, X.T)
 
 
-def check_estimators_fit_returns_self(name, Estimator):
+def check_classifiers_train_readonly(name, Classifier):
+    check_classifiers_train(name, Classifier, readonly=True)
+
+
+def check_estimators_fit_returns_self(name, Estimator, readonly=False):
     """Check if self is returned when calling fit"""
-    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
+    X, y = _make_blobs_with_mode(random_state=0, n_samples=9, n_features=4,
+                                 readonly=readonly, positive=True)
     y = multioutput_estimator_convert_y_2d(name, y)
-    # some want non-negative input
-    X -= X.min()
 
     estimator = Estimator()
     set_testing_parameters(estimator)
 
     set_random_state(estimator)
+    if readonly:
+        assert_false(X.flags['WRITEABLE'])
+        assert_false(y.flags['WRITEABLE'])
     assert_true(estimator.fit(X, y) is estimator)
 
 
 @ignore_warnings
+def check_estimators_fit_returns_self_readonly(name, Estimator):
+    """Check if Estimator.fit does not fail on read-only mem-mapped data"""
+    check_estimators_fit_returns_self(name, Estimator, readonly=True)
+
+
 def check_estimators_unfitted(name, Estimator):
     """Check that predict raises an exception in an unfitted estimator.
 
@@ -1126,11 +1264,19 @@ def check_regressors_int(name, Regressor):
     assert_array_almost_equal(pred1, pred2, 2, name)
 
 
-def check_regressors_train(name, Regressor):
-    X, y = _boston_subset()
-    y = StandardScaler().fit_transform(y.reshape(-1, 1))  # X is already scaled
-    y = y.ravel()
-    y = multioutput_estimator_convert_y_2d(name, y)
+def check_regressors_train_readonly(name, Regressor):
+    check_regressors_train(name, Regressor, readonly=True)
+
+
+def check_regressors_train(name, Regressor, readonly=False):
+    # Reproduce multioutput_estimator_convert_y_2d for the read-only
+    # boston subset
+    if name in ['MultiTaskElasticNetCV', 'MultiTaskLassoCV',
+                'MultiTaskLasso', 'MultiTaskElasticNet']:
+        convert_y_2d = True
+    else:
+        convert_y_2d = False
+    X, y = _boston_subset_with_mode(readonly=readonly, scale_y=True,
+                                    convert_y_2d=convert_y_2d)
     rnd = np.random.RandomState(0)
     # catch deprecation warnings
     with warnings.catch_warnings(record=True):
@@ -1150,7 +1296,12 @@ def check_regressors_train(name, Regressor):
         y_ = y_.T
     else:
         y_ = y
+    if readonly:
+        assert_false(X.flags['WRITEABLE'])
+        assert_false(y_.flags['WRITEABLE'])
     set_random_state(regressor)
+    if readonly:
+        assert_false(X.flags['WRITEABLE'])
     regressor.fit(X, y_)
     regressor.fit(X.tolist(), y_.tolist())
     y_pred = regressor.predict(X)
@@ -1163,6 +1314,32 @@ def check_regressors_train(name, Regressor):
         assert_greater(regressor.score(X, y_), 0.5)
 
 
+def check_regressors_pickle(name, Regressor):
+    X, y = _boston_subset(scale_y=True)
+    y = multioutput_estimator_convert_y_2d(name, y)
+    rnd = np.random.RandomState(0)
+    # catch deprecation warnings
+    with warnings.catch_warnings(record=True):
+        regressor = Regressor()
+    set_testing_parameters(regressor)
+    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
+        # linear regressors need to set alpha, but not generalized CV ones
+        regressor.alpha = 0.01
+
+    if name in CROSS_DECOMPOSITION:
+        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
+        y_ = y_.T
+    else:
+        y_ = y
+    regressor.fit(X, y_)
+    y_pred = regressor.predict(X)
+    # store old predictions
+    pickled_regressor = pickle.dumps(regressor)
+    unpickled_regressor = pickle.loads(pickled_regressor)
+    pickled_y_pred = unpickled_regressor.predict(X)
+    assert_array_almost_equal(pickled_y_pred, y_pred)
+
+
 @ignore_warnings
 def check_regressors_no_decision_function(name, Regressor):
     # checks whether regressors have decision_function or predict_proba
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index d577864fb709a..699c248b91803 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -10,7 +10,7 @@
 import scipy.sparse as sp
 from nose.tools import assert_raises, assert_true, assert_false, assert_equal
 
-from sklearn.utils.testing import assert_raises_regexp
+from sklearn.utils.testing import assert_raises_regexp, TempMemmap
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_warns
@@ -442,6 +442,13 @@ def test_check_is_fitted():
     assert_equal(None, check_is_fitted(ard, "coef_"))
     assert_equal(None, check_is_fitted(svr, "support_"))
 
+def test_check_array_memmap():
+    X = np.ones((4, 4))
+    with TempMemmap(X, mmap_mode='r') as X:
+        Z = check_array(X, copy=False)
+        assert_true(Z.base is X)
+        assert_false(Z.flags['WRITEABLE'])
+
 
 def test_check_consistent_length():
     check_consistent_length([1], [2], [3], [4], [5])
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index e62f3b4ba6d47..a86f172275ab6 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -344,6 +344,8 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
     if isinstance(accept_sparse, str):
         accept_sparse = [accept_sparse]
 
+    # store a reference to the original array to check if a copy is needed on return
+    array_orig = array
 
     # store whether originally we wanted numeric dtype
     dtype_numeric = dtype == "numeric"
@@ -381,7 +383,10 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
         array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
                                       force_all_finite)
     else:
-        array = np.array(array, dtype=dtype, order=order, copy=copy)
+        # Use np.asarray so that memory-mapped inputs are not physically
+        # copied here: for a np.memmap input, the result is a plain
+        # ndarray view whose ``base`` is the original array.
+        array = np.asarray(array, dtype=dtype, order=order)
 
         if ensure_2d:
             if array.ndim == 1:
@@ -396,10 +401,8 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
                     "X.reshape(1, -1) if it contains a single sample.",
                     DeprecationWarning)
                 array = np.atleast_2d(array)
-            # To ensure that array flags are maintained
-            array = np.array(array, dtype=dtype, order=order, copy=copy)
 
-        # make sure we acually converted to numeric:
+        # make sure we actually converted to numeric:
         if dtype_numeric and array.dtype.kind == "O":
             array = array.astype(np.float64)
         if not allow_nd and array.ndim >= 3:
@@ -429,6 +432,10 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
             msg = ("Data with input dtype %s was converted to %s%s."
                    % (dtype_orig, array.dtype, context))
             warnings.warn(msg, DataConversionWarning_)
+
+    if copy and np.may_share_memory(array, array_orig):
+        array = np.array(array, dtype=dtype, order=order)
+
     return array
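
End-to-end illustration (not part of the patch): the sketch below shows the behaviour these hunks are meant to enable. It assumes the patched check_array and the MinMaxScaler.transform(copy=...) keyword added by this diff; the temporary-file handling and the choice of MinMaxScaler are illustrative only.

    import os
    import tempfile

    import numpy as np

    from sklearn.preprocessing import MinMaxScaler
    from sklearn.utils.validation import check_array

    # Persist an array and reopen it as a read-only memory map.
    data = np.random.RandomState(0).rand(100, 3)
    path = os.path.join(tempfile.mkdtemp(), 'data.npy')
    np.save(path, data)
    X = np.load(path, mmap_mode='r')   # read-only np.memmap

    # With the patched check_array, copy=False no longer forces a writable
    # copy: the result is a read-only view of the memory map.
    Z = check_array(X, copy=False)
    assert not Z.flags['WRITEABLE']

    # copy=True still has to return a writable array that owns its data.
    Z_copy = check_array(X, copy=True)
    assert Z_copy.flags['WRITEABLE']

    # Estimators now copy only when an in-place modification would otherwise
    # fail, so fitting on the read-only memmap should not raise ValueError.
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X, copy=True)
    print(X_scaled.min(axis=0), X_scaled.max(axis=0))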