Skip to content

Commit 6af1d84

Browse files
committed
FIX improve 'precompute' handling in Lars
1 parent e5f71e7 commit 6af1d84

File tree

4 files changed

+81
-39
lines changed

4 files changed

+81
-39
lines changed

sklearn/linear_model/least_angle.py

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -174,16 +174,19 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
174174
swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (X,))
175175
solve_cholesky, = get_lapack_funcs(('potrs',), (X,))
176176

177-
if Gram is None:
177+
if Gram is None or Gram is False:
178+
Gram = None
178179
if copy_X:
179180
# force copy. setting the array to be fortran-ordered
180181
# speeds up the calculation of the (partial) Gram matrix
181182
# and allows to easily swap columns
182183
X = X.copy('F')
183-
elif isinstance(Gram, string_types) and Gram == 'auto':
184-
Gram = None
185-
if X.shape[0] > X.shape[1]:
184+
185+
elif isinstance(Gram, string_types) and Gram == 'auto' or Gram is True:
186+
if Gram is True or X.shape[0] > X.shape[1]:
186187
Gram = np.dot(X.T, X)
188+
else:
189+
Gram = None
187190
elif copy_Gram:
188191
Gram = Gram.copy()
189192

@@ -593,16 +596,14 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True,
593596
self.copy_X = copy_X
594597
self.fit_path = fit_path
595598

596-
def _get_gram(self):
597-
# precompute if n_samples > n_features
598-
precompute = self.precompute
599-
if hasattr(precompute, '__array__'):
600-
Gram = precompute
601-
elif precompute == 'auto':
602-
Gram = 'auto'
603-
else:
604-
Gram = None
605-
return Gram
599+
def _get_gram(self, precompute, X, y):
600+
if (not hasattr(precompute, '__array__')) and (
601+
(precompute is True) or
602+
(precompute == 'auto' and X.shape[0] > X.shape[1]) or
603+
(precompute == 'auto' and y.shape[1] > 1)):
604+
precompute = np.dot(X.T, X)
605+
606+
return precompute
606607

607608
def fit(self, X, y, Xy=None):
608609
"""Fit the model using X, y as training data.
@@ -645,14 +646,7 @@ def fit(self, X, y, Xy=None):
645646
else:
646647
max_iter = self.max_iter
647648

648-
precompute = self.precompute
649-
if not hasattr(precompute, '__array__') and (
650-
precompute is True or
651-
(precompute == 'auto' and X.shape[0] > X.shape[1]) or
652-
(precompute == 'auto' and y.shape[1] > 1)):
653-
Gram = np.dot(X.T, X)
654-
else:
655-
Gram = self._get_gram()
649+
Gram = self._get_gram(self.precompute, X, y)
656650

657651
self.alphas_ = []
658652
self.n_iter_ = []
@@ -972,10 +966,10 @@ class LarsCV(Lars):
972966
copy_X : boolean, optional, default True
973967
If ``True``, X will be copied; else, it may be overwritten.
974968
975-
precompute : True | False | 'auto' | array-like
969+
precompute : True | False | 'auto'
976970
Whether to use a precomputed Gram matrix to speed up
977-
calculations. If set to ``'auto'`` let us decide. The Gram
978-
matrix can also be passed as argument.
971+
calculations. If set to ``'auto'`` let us decide. The Gram matrix
972+
cannot be passed as argument since we will use only subsets of X.
979973
980974
max_iter: integer, optional
981975
Maximum number of iterations to perform.
@@ -1081,7 +1075,13 @@ def fit(self, X, y):
10811075
# init cross-validation generator
10821076
cv = check_cv(self.cv, classifier=False)
10831077

1084-
Gram = 'auto' if self.precompute else None
1078+
# As we use cross-validation, the Gram matrix is not precomputed here
1079+
Gram = self.precompute
1080+
if hasattr(Gram, '__array__'):
1081+
warnings.warn("Parameter 'precompute' cannot be an array in "
1082+
"%s. Automatically switching to 'auto' instead."
1083+
% self.__class__.__name__)
1084+
Gram = 'auto'
10851085

10861086
cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
10871087
delayed(_lars_path_residues)(
@@ -1171,10 +1171,10 @@ class LassoLarsCV(LarsCV):
11711171
normalize : boolean, optional, default False
11721172
If True, the regressors X will be normalized before regression.
11731173
1174-
precompute : True | False | 'auto' | array-like
1174+
precompute : True | False | 'auto'
11751175
Whether to use a precomputed Gram matrix to speed up
1176-
calculations. If set to ``'auto'`` let us decide. The Gram
1177-
matrix can also be passed as argument.
1176+
calculations. If set to ``'auto'`` let us decide. The Gram matrix
1177+
cannot be passed as argument since we will use only subsets of X.
11781178
11791179
max_iter : integer, optional
11801180
Maximum number of iterations to perform.
@@ -1404,7 +1404,7 @@ def fit(self, X, y, copy_X=True):
14041404
X, y, self.fit_intercept, self.normalize, self.copy_X)
14051405
max_iter = self.max_iter
14061406

1407-
Gram = self._get_gram()
1407+
Gram = self.precompute
14081408

14091409
alphas_, active_, coef_path_, self.n_iter_ = lars_path(
14101410
X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0,

sklearn/linear_model/randomized_l1.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
165165
alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))
166166

167167
X = (1 - weights) * X
168+
168169
with warnings.catch_warnings():
169170
warnings.simplefilter('ignore', ConvergenceWarning)
170171
alphas_, _, coef_ = lars_path(X, y,
@@ -226,10 +227,11 @@ class RandomizedLasso(BaseRandomizedLinearModel):
226227
normalize : boolean, optional, default True
227228
If True, the regressors X will be normalized before regression.
228229
229-
precompute : True | False | 'auto'
230-
Whether to use a precomputed Gram matrix to speed up
231-
calculations. If set to 'auto' let us decide. The Gram
232-
matrix can also be passed as argument.
230+
precompute : True | False | 'auto' | array-like
231+
Whether to use a precomputed Gram matrix to speed up calculations.
232+
If set to 'auto' let us decide.
233+
The Gram matrix can also be passed as argument, but it will be used
234+
only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.
233235
234236
max_iter : integer, optional
235237
Maximum number of iterations to perform in the Lars algorithm.
@@ -328,7 +330,6 @@ def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
328330
self.memory = memory
329331

330332
def _make_estimator_and_params(self, X, y):
331-
assert self.precompute in (True, False, None, 'auto')
332333
alpha = self.alpha
333334
if alpha in ('aic', 'bic'):
334335
model = LassoLarsIC(precompute=self.precompute,
@@ -337,9 +338,16 @@ def _make_estimator_and_params(self, X, y):
337338
eps=self.eps)
338339
model.fit(X, y)
339340
self.alpha_ = alpha = model.alpha_
341+
342+
precompute = self.precompute
343+
# A precomputed Gram array is useless, since _randomized_lasso
344+
# changes X at each iteration
345+
if hasattr(precompute, '__array__'):
346+
precompute = 'auto'
347+
assert precompute in (True, False, None, 'auto')
340348
return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
341349
eps=self.eps,
342-
precompute=self.precompute)
350+
precompute=precompute)
343351

344352

345353
###############################################################################

sklearn/linear_model/tests/test_least_angle.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,20 @@ def test_no_path_all_precomputed():
169169
assert_true(alpha_ == alphas_[-1])
170170

171171

172+
def test_lars_precompute():
173+
# Check for different values of precompute
174+
X, y = diabetes.data, diabetes.target
175+
G = np.dot(X.T, X)
176+
for classifier in [linear_model.Lars, linear_model.LarsCV,
177+
linear_model.LassoLarsIC]:
178+
clf = classifier(precompute=G)
179+
output_1 = ignore_warnings(clf.fit)(X, y).coef_
180+
for precompute in [True, False, 'auto', None]:
181+
clf = classifier(precompute=precompute)
182+
output_2 = clf.fit(X, y).coef_
183+
assert_array_almost_equal(output_1, output_2, decimal=8)
184+
185+
172186
def test_singular_matrix():
173187
# Test when input is a singular matrix
174188
X1 = np.array([[1, 1.], [1., 1.]])

sklearn/linear_model/tests/test_randomized_l1.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,18 @@ def test_randomized_lasso():
4242
# Check randomized lasso
4343
scaling = 0.3
4444
selection_threshold = 0.5
45+
n_resampling = 20
4546

4647
# or with 1 alpha
4748
clf = RandomizedLasso(verbose=False, alpha=1, random_state=42,
48-
scaling=scaling,
49+
scaling=scaling, n_resampling=n_resampling,
4950
selection_threshold=selection_threshold)
5051
feature_scores = clf.fit(X, y).scores_
5152
assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])
5253

5354
# or with many alphas
5455
clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
55-
scaling=scaling,
56+
scaling=scaling, n_resampling=n_resampling,
5657
selection_threshold=selection_threshold)
5758
feature_scores = clf.fit(X, y).scores_
5859
assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
@@ -64,7 +65,7 @@ def test_randomized_lasso():
6465
assert_equal(X_full.shape, X.shape)
6566

6667
clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42,
67-
scaling=scaling)
68+
scaling=scaling, n_resampling=n_resampling)
6869
feature_scores = clf.fit(X, y).scores_
6970
assert_array_equal(feature_scores, X.shape[1] * [1.])
7071

@@ -75,6 +76,25 @@ def test_randomized_lasso():
7576
assert_raises(ValueError, clf.fit, X, y)
7677

7778

79+
def test_randomized_lasso_precompute():
80+
# Check randomized lasso for different values of precompute
81+
n_resampling = 20
82+
alpha = 1
83+
random_state = 42
84+
85+
G = np.dot(X.T, X)
86+
87+
clf = RandomizedLasso(alpha=alpha, random_state=random_state,
88+
precompute=G, n_resampling=n_resampling)
89+
feature_scores_1 = clf.fit(X, y).scores_
90+
91+
for precompute in [True, False, None, 'auto']:
92+
clf = RandomizedLasso(alpha=alpha, random_state=random_state,
93+
precompute=precompute, n_resampling=n_resampling)
94+
feature_scores_2 = clf.fit(X, y).scores_
95+
assert_array_equal(feature_scores_1, feature_scores_2)
96+
97+
7898
def test_randomized_logistic():
7999
# Check randomized sparse logistic regression
80100
iris = load_iris()

0 commit comments

Comments
 (0)