Skip to content

Commit c8753d4

Browse files
authored
ENH Preserving dtype for numpy.float32 in Least Angle Regression (#20155)
1 parent 1c36b49 commit c8753d4

File tree

3 files changed

+72
-6
lines changed

3 files changed

+72
-6
lines changed

doc/whats_new/v1.0.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,10 @@ Changelog
344344
is now faster. This is especially noticeable on large sparse input.
345345
:pr:`19734` by :user:`Fred Robinson <frrad>`.
346346

347+
- |Enhancement| `fit` method preserves dtype for numpy.float32 in
348+
:class:`Lars`, :class:`LassoLars`, :class:`LassoLarsIC`, :class:`LarsCV` and
349+
:class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura <takoika>`.
350+
347351
:mod:`sklearn.manifold`
348352
.......................
349353

sklearn/linear_model/_least_angle.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,23 @@ def _lars_path_solver(
476476

477477
max_features = min(max_iter, n_features)
478478

479+
dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None)
480+
if len(dtypes) == 1:
481+
# use the precision level of input data if it is consistent
482+
return_dtype = next(iter(dtypes))
483+
else:
484+
# fallback to double precision otherwise
485+
return_dtype = np.float64
486+
479487
if return_path:
480-
coefs = np.zeros((max_features + 1, n_features))
481-
alphas = np.zeros(max_features + 1)
488+
coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype)
489+
alphas = np.zeros(max_features + 1, dtype=return_dtype)
482490
else:
483-
coef, prev_coef = np.zeros(n_features), np.zeros(n_features)
484-
alpha, prev_alpha = np.array([0.]), np.array([0.]) # better ideas?
491+
coef, prev_coef = (np.zeros(n_features, dtype=return_dtype),
492+
np.zeros(n_features, dtype=return_dtype))
493+
alpha, prev_alpha = (np.array([0.], dtype=return_dtype),
494+
np.array([0.], dtype=return_dtype))
495+
# above better ideas?
485496

486497
n_iter, n_active = 0, 0
487498
active, indices = list(), np.arange(n_features)
@@ -948,7 +959,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None):
948959

949960
self.alphas_ = []
950961
self.n_iter_ = []
951-
self.coef_ = np.empty((n_targets, n_features))
962+
self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype)
952963

953964
if fit_path:
954965
self.active_ = []

sklearn/linear_model/tests/test_least_angle.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from sklearn import linear_model, datasets
1515
from sklearn.linear_model._least_angle import _lars_path_residues
1616
from sklearn.linear_model import LassoLarsIC, lars_path
17-
from sklearn.linear_model import Lars, LassoLars
17+
from sklearn.linear_model import Lars, LassoLars, LarsCV, LassoLarsCV
1818

1919
# TODO: use another dataset that has multiple drops
2020
diabetes = datasets.load_diabetes()
@@ -777,3 +777,54 @@ def test_copy_X_with_auto_gram():
777777
linear_model.lars_path(X, y, Gram='auto', copy_X=True, method='lasso')
778778
# X did not change
779779
assert_allclose(X, X_before)
780+
781+
782+
@pytest.mark.parametrize("LARS, has_coef_path, args",
783+
((Lars, True, {}),
784+
(LassoLars, True, {}),
785+
(LassoLarsIC, False, {}),
786+
(LarsCV, True, {}),
787+
# max_iter=5 is for avoiding ConvergenceWarning
788+
(LassoLarsCV, True, {"max_iter": 5})))
789+
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
790+
def test_lars_dtype_match(LARS, has_coef_path, args, dtype):
791+
# The test ensures that the fit method preserves input dtype
792+
rng = np.random.RandomState(0)
793+
X = rng.rand(6, 6).astype(dtype)
794+
y = rng.rand(6).astype(dtype)
795+
796+
model = LARS(**args)
797+
model.fit(X, y)
798+
assert model.coef_.dtype == dtype
799+
if has_coef_path:
800+
assert model.coef_path_.dtype == dtype
801+
assert model.intercept_.dtype == dtype
802+
803+
804+
@pytest.mark.parametrize("LARS, has_coef_path, args",
805+
((Lars, True, {}),
806+
(LassoLars, True, {}),
807+
(LassoLarsIC, False, {}),
808+
(LarsCV, True, {}),
809+
# max_iter=5 is for avoiding ConvergenceWarning
810+
(LassoLarsCV, True, {"max_iter": 5})))
811+
def test_lars_numeric_consistency(LARS, has_coef_path, args):
812+
# The test ensures numerical consistency between trained coefficients
813+
# of float32 and float64.
814+
rtol = 1e-5
815+
atol = 1e-5
816+
817+
rng = np.random.RandomState(0)
818+
X_64 = rng.rand(6, 6)
819+
y_64 = rng.rand(6)
820+
821+
model_64 = LARS(**args).fit(X_64, y_64)
822+
model_32 = LARS(**args).fit(X_64.astype(np.float32),
823+
y_64.astype(np.float32))
824+
825+
assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol)
826+
if has_coef_path:
827+
assert_allclose(model_64.coef_path_, model_32.coef_path_,
828+
rtol=rtol, atol=atol)
829+
assert_allclose(model_64.intercept_, model_32.intercept_,
830+
rtol=rtol, atol=atol)

0 commit comments

Comments
 (0)