@@ -48,6 +48,7 @@
     assert_almost_equal,
     assert_array_almost_equal,
     assert_array_equal,
+    assert_array_less,
     ignore_warnings,
 )
 from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
@@ -1304,55 +1305,78 @@ def test_enet_sample_weight_consistency(
 
 @pytest.mark.parametrize("fit_intercept", [True, False])
 @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS)
-def test_enet_cv_sample_weight_correctness(fit_intercept, sparse_container):
-    """Test that ElasticNetCV with sample weights gives correct results."""
-    rng = np.random.RandomState(42)
-    n_splits, n_samples, n_features = 3, 10, 5
-    X = rng.rand(n_splits * n_samples, n_features)
+def test_enet_cv_sample_weight_correctness(
+    fit_intercept, sparse_container, global_random_seed
+):
+    """Test that ElasticNetCV with sample weights gives correct results.
+
+    We fit the same model twice, once with weighted training data and once with
+    repeated data points in the training data, and check that both models
+    converge to the same solution.
+
+    Since this model uses an internal cross-validation scheme to tune the alpha
+    regularization parameter, we make sure that the repetitions only occur within
+    a specific CV group. Data points belonging to other CV groups stay
+    unit-weighted / "unrepeated".
+    """
+    rng = np.random.RandomState(global_random_seed)
+    n_splits, n_samples_per_cv, n_features = 3, 10, 5
+    X_with_weights = rng.rand(n_splits * n_samples_per_cv, n_features)
     beta = rng.rand(n_features)
     beta[0:2] = 0
-    y = X @ beta + rng.rand(n_splits * n_samples)
-    sw = np.ones_like(y)
+    y_with_weights = X_with_weights @ beta + rng.rand(n_splits * n_samples_per_cv)
+
     if sparse_container is not None:
-        X = sparse_container(X)
+        X_with_weights = sparse_container(X_with_weights)
     params = dict(tol=1e-6)
 
-    # Set alphas, otherwise the two cv models might use different ones.
-    if fit_intercept:
-        alphas = np.linspace(0.001, 0.01, num=91)
-    else:
-        alphas = np.linspace(0.01, 0.1, num=91)
-
-    # We weight the first fold 2 times more.
-    sw[:n_samples] = 2
-    groups_sw = np.r_[
-        np.full(n_samples, 0), np.full(n_samples, 1), np.full(n_samples, 2)
-    ]
-    splits_sw = list(LeaveOneGroupOut().split(X, groups=groups_sw))
-    reg_sw = ElasticNetCV(
-        alphas=alphas, cv=splits_sw, fit_intercept=fit_intercept, **params
+    # Assign random integer weights only to the first cross-validation group.
+    # The samples in the other cross-validation groups are left with unit
+    # weights.
+
+    sw = np.ones_like(y_with_weights)
+    sw[:n_samples_per_cv] = rng.randint(0, 5, size=n_samples_per_cv)
+    groups_with_weights = np.concatenate(
+        [
+            np.full(n_samples_per_cv, 0),
+            np.full(n_samples_per_cv, 1),
+            np.full(n_samples_per_cv, 2),
+        ]
+    )
+    splits_with_weights = list(
+        LeaveOneGroupOut().split(X_with_weights, groups=groups_with_weights)
+    )
+    reg_with_weights = ElasticNetCV(
+        cv=splits_with_weights, fit_intercept=fit_intercept, **params
     )
-    reg_sw.fit(X, y, sample_weight=sw)
 
-    # We repeat the first fold 2 times and provide splits ourselves
+    reg_with_weights.fit(X_with_weights, y_with_weights, sample_weight=sw)
+
     if sparse_container is not None:
-        X = X.toarray()
-    X = np.r_[X[:n_samples], X]
+        X_with_weights = X_with_weights.toarray()
+    X_with_repetitions = np.repeat(X_with_weights, sw.astype(int), axis=0)
     if sparse_container is not None:
-        X = sparse_container(X)
-    y = np.r_[y[:n_samples], y]
-    groups = np.r_[
-        np.full(2 * n_samples, 0), np.full(n_samples, 1), np.full(n_samples, 2)
-    ]
-    splits = list(LeaveOneGroupOut().split(X, groups=groups))
-    reg = ElasticNetCV(alphas=alphas, cv=splits, fit_intercept=fit_intercept, **params)
-    reg.fit(X, y)
+        X_with_repetitions = sparse_container(X_with_repetitions)
+
+    y_with_repetitions = np.repeat(y_with_weights, sw.astype(int), axis=0)
+    groups_with_repetitions = np.repeat(groups_with_weights, sw.astype(int), axis=0)
+
+    splits_with_repetitions = list(
+        LeaveOneGroupOut().split(X_with_repetitions, groups=groups_with_repetitions)
+    )
+    reg_with_repetitions = ElasticNetCV(
+        cv=splits_with_repetitions, fit_intercept=fit_intercept, **params
+    )
+    reg_with_repetitions.fit(X_with_repetitions, y_with_repetitions)
 
-    # ensure that we chose meaningful alphas, i.e. not boundaries
-    assert alphas[0] < reg.alpha_ < alphas[-1]
-    assert reg_sw.alpha_ == reg.alpha_
-    assert_allclose(reg_sw.coef_, reg.coef_)
-    assert reg_sw.intercept_ == pytest.approx(reg.intercept_)
+    # Check that the alpha selection process is the same:
+    assert_allclose(reg_with_weights.mse_path_, reg_with_repetitions.mse_path_)
+    assert_allclose(reg_with_weights.alphas_, reg_with_repetitions.alphas_)
+    assert reg_with_weights.alpha_ == pytest.approx(reg_with_repetitions.alpha_)
+
+    # Check that the final model coefficients are the same:
+    assert_allclose(reg_with_weights.coef_, reg_with_repetitions.coef_, atol=1e-10)
+    assert reg_with_weights.intercept_ == pytest.approx(reg_with_repetitions.intercept_)
 
 
 @pytest.mark.parametrize("sample_weight", [False, True])
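Note: the weight/repetition equivalence this test relies on can also be seen with a plain ElasticNet fit at a fixed alpha, outside the CV machinery. A minimal sketch, not part of this diff; the toy data, alpha, and tolerances below are illustrative choices:

```python
import numpy as np
from numpy.testing import assert_allclose
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
y = X @ np.array([1.0, 2.0, 0.0]) + 0.1 * rng.rand(10)

# Integer weights: sample i should count sw[i] times in the fit.
sw = rng.randint(1, 4, size=10)
reg_weighted = ElasticNet(alpha=0.01, tol=1e-8).fit(X, y, sample_weight=sw)

# Materialize the same weighting by repeating each row sw[i] times,
# just as the test does with np.repeat.
X_rep = np.repeat(X, sw, axis=0)
y_rep = np.repeat(y, sw)
reg_repeated = ElasticNet(alpha=0.01, tol=1e-8).fit(X_rep, y_rep)

assert_allclose(reg_weighted.coef_, reg_repeated.coef_, atol=1e-6)
assert_allclose(reg_weighted.intercept_, reg_repeated.intercept_, atol=1e-6)
```

The CV test above is stricter: it additionally confines the repetitions to a single LeaveOneGroupOut group, so every train/validation split sees the same effective data in both fits, which is what makes the mse_path_ comparison meaningful.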
@@ -1444,9 +1468,29 @@ def test_enet_cv_sample_weight_consistency(
     assert_allclose(reg.intercept_, intercept)
 
 
+@pytest.mark.parametrize("X_is_sparse", [False, True])
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("sample_weight", [np.array([10, 1, 10, 1]), None])
+def test_enet_alpha_max_sample_weight(X_is_sparse, fit_intercept, sample_weight):
+    X = np.array([[3.0, 1.0], [2.0, 5.0], [5.0, 3.0], [1.0, 4.0]])
+    beta = np.array([1, 1])
+    y = X @ beta
+    if X_is_sparse:
+        X = sparse.csc_matrix(X)
+    # Test alpha_max makes coefs zero.
+    reg = ElasticNetCV(n_alphas=1, cv=2, eps=1, fit_intercept=fit_intercept)
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(reg.coef_, 0, atol=1e-5)
+    alpha_max = reg.alpha_
+    # Test smaller alpha makes coefs nonzero.
+    reg = ElasticNet(alpha=0.99 * alpha_max, fit_intercept=fit_intercept)
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_array_less(1e-3, np.max(np.abs(reg.coef_)))
+
+
 @pytest.mark.parametrize("estimator", [ElasticNetCV, LassoCV])
 def test_linear_models_cv_fit_with_loky(estimator):
-    # LinearModelsCV.fit performs inplace operations on fancy-indexed memmapped
+    # LinearModelsCV.fit performs operations on fancy-indexed memmapped
     # data when using the loky backend, causing an error due to unexpected
     # behavior of fancy indexing of read-only memmaps (cf. numpy#14132).
 
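Background for test_enet_alpha_max_sample_weight: for the elastic-net objective 1/(2*n_samples) * ||y - Xw||^2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2, the smallest penalty that forces all coefficients to exactly zero is alpha_max = max_j |X_c[:, j] @ y_c| / (n_samples * l1_ratio) on centered data, and ElasticNetCV(n_alphas=1, eps=1) collapses the alpha grid to just that value. A sketch of the unweighted arithmetic in the Lasso special case (l1_ratio=1); the data and names here are illustrative, not taken from the test:

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.rand(20, 4)
y = X @ np.array([1.0, -2.0, 0.0, 0.5])

# Center the data as fit_intercept=True would, then apply the classical
# formula: alpha_max = max_j |X_c[:, j] @ y_c| / n_samples is the smallest
# penalty at which the Lasso solution is exactly zero.
X_c = X - X.mean(axis=0)
y_c = y - y.mean()
alpha_max = np.max(np.abs(X_c.T @ y_c)) / X.shape[0]

# At alpha_max every coefficient vanishes; slightly below it, some reappear.
assert np.allclose(Lasso(alpha=alpha_max).fit(X, y).coef_, 0.0)
assert np.abs(Lasso(alpha=0.99 * alpha_max).fit(X, y).coef_).max() > 0.0
```

The new test asserts this same boundary behavior when sample_weight is passed, i.e. that the internally computed alpha_max accounts for the weights (roughly speaking, the dot products and n_samples above are replaced by their sample-weighted counterparts).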