Commit 408f561

EFF Speed-up MiniBatchDictionaryLearning by avoiding multiple validation (#25490)
1 parent 20ad9cd · commit 408f561
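
The pattern behind the speed-up: the public `sparse_encode` entry point now validates its inputs once, and the minibatch hot path calls private helpers that skip re-validation. A minimal sketch of the public API involved, with illustrative shapes and values (not part of this commit):

    import numpy as np
    from sklearn.decomposition import sparse_encode

    rng = np.random.RandomState(0)
    X = rng.randn(8, 10)           # (n_samples, n_features)
    dictionary = rng.randn(5, 10)  # (n_components, n_features)

    # The public function validates X and dictionary once, then delegates
    # the solve; each row of the result is the solution to a Lasso problem.
    code = sparse_encode(X, dictionary, algorithm="lasso_lars", alpha=0.1)
    print(code.shape)              # (8, 5): (n_samples, n_components)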

File tree

2 files changed: +73 −55 lines changed

doc/whats_new/v1.3.rst

Lines changed: 5 additions & 0 deletions
@@ -117,6 +117,11 @@ Changelog
   `callback` for consistency with the function :func:`decomposition.dict_learning`.
   :pr:`24871` by :user:`Omar Salman <OmarManzoor>`.
 
+- |Efficiency| :class:`decomposition.MiniBatchDictionaryLearning` and
+  :class:`decomposition.MiniBatchSparsePCA` are now faster for small batch sizes by
+  avoiding duplicate validations.
+  :pr:`25490` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.ensemble`
 .......................
 
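For context on the entry above, a toy fit of one of the affected estimators; the data and parameter values below are illustrative only:

    import numpy as np
    from sklearn.decomposition import MiniBatchDictionaryLearning

    rng = np.random.RandomState(0)
    X = rng.randn(500, 30)

    # A small batch_size means many minibatch steps per pass over X, so the
    # per-step validation overhead removed by this commit accumulates.
    dico = MiniBatchDictionaryLearning(
        n_components=15, batch_size=4, alpha=1.0, random_state=0
    )
    code = dico.fit(X).transform(X)  # shape (500, 15)
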
sklearn/decomposition/_dict_learning.py

Lines changed: 68 additions & 55 deletions
@@ -32,23 +32,23 @@ def _check_positive_coding(method, positive):
         )
 
 
-def _sparse_encode(
+def _sparse_encode_precomputed(
     X,
     dictionary,
-    gram,
+    *,
+    gram=None,
     cov=None,
     algorithm="lasso_lars",
     regularization=None,
     copy_cov=True,
     init=None,
     max_iter=1000,
-    check_input=True,
     verbose=0,
     positive=False,
 ):
-    """Generic sparse coding.
+    """Generic sparse coding with precomputed Gram and/or covariance matrices.
 
-    Each column of the result is the solution to a Lasso problem.
+    Each row of the result is the solution to a Lasso problem.
 
     Parameters
     ----------
@@ -59,7 +59,7 @@ def _sparse_encode(
         The dictionary matrix against which to solve the sparse coding of
         the data. Some of the algorithms assume normalized rows.
 
-    gram : ndarray of shape (n_components, n_components) or None
+    gram : ndarray of shape (n_components, n_components), default=None
         Precomputed Gram matrix, `dictionary * dictionary'`
         gram can be `None` if method is 'threshold'.
 
@@ -98,9 +98,6 @@ def _sparse_encode(
         Whether to copy the precomputed covariance matrix; if `False`, it may
         be overwritten.
 
-    check_input : bool, default=True
-        If `False`, the input arrays `X` and dictionary will not be checked.
-
     verbose : int, default=0
         Controls the verbosity; the higher, the more messages.
 
@@ -113,29 +110,9 @@ def _sparse_encode(
     -------
     code : ndarray of shape (n_components, n_features)
         The sparse codes.
-
-    See Also
-    --------
-    sklearn.linear_model.lars_path
-    sklearn.linear_model.orthogonal_mp
-    sklearn.linear_model.Lasso
-    SparseCoder
     """
-    if X.ndim == 1:
-        X = X[:, np.newaxis]
     n_samples, n_features = X.shape
     n_components = dictionary.shape[0]
-    if dictionary.shape[1] != X.shape[1]:
-        raise ValueError(
-            "Dictionary and X have different numbers of features:"
-            "dictionary.shape: {} X.shape{}".format(dictionary.shape, X.shape)
-        )
-    if cov is None and algorithm != "lasso_cd":
-        # overwriting cov is safe
-        copy_cov = False
-        cov = np.dot(dictionary, X.T)
-
-    _check_positive_coding(algorithm, positive)
 
     if algorithm == "lasso_lars":
         alpha = float(regularization) / n_features  # account for scaling
@@ -183,7 +160,7 @@ def _sparse_encode(
                 init = np.array(init)
             clf.coef_ = init
 
-        clf.fit(dictionary.T, X.T, check_input=check_input)
+        clf.fit(dictionary.T, X.T, check_input=False)
         new_code = clf.coef_
 
     elif algorithm == "lars":
@@ -218,14 +195,8 @@ def _sparse_encode(
             norms_squared=row_norms(X, squared=True),
             copy_Xy=copy_cov,
         ).T
-    else:
-        raise ValueError(
-            'Sparse coding method must be "lasso_lars" '
-            '"lasso_cd", "lasso", "threshold" or "omp", got %s.' % algorithm
-        )
-    if new_code.ndim != 2:
-        return new_code.reshape(n_samples, n_components)
-    return new_code
+
+    return new_code.reshape(n_samples, n_components)
 
 
 @validate_params(
@@ -375,15 +346,51 @@ def sparse_encode(
     dictionary = check_array(dictionary)
     X = check_array(X)
 
-    n_samples, n_features = X.shape
-    n_components = dictionary.shape[0]
+    if dictionary.shape[1] != X.shape[1]:
+        raise ValueError(
+            "Dictionary and X have different numbers of features:"
+            "dictionary.shape: {} X.shape{}".format(dictionary.shape, X.shape)
+        )
 
-    if gram is None and algorithm != "threshold":
-        gram = np.dot(dictionary, dictionary.T)
+    _check_positive_coding(algorithm, positive)
 
-    if cov is None and algorithm != "lasso_cd":
-        copy_cov = False
-        cov = np.dot(dictionary, X.T)
+    return _sparse_encode(
+        X,
+        dictionary,
+        gram=gram,
+        cov=cov,
+        algorithm=algorithm,
+        n_nonzero_coefs=n_nonzero_coefs,
+        alpha=alpha,
+        copy_cov=copy_cov,
+        init=init,
+        max_iter=max_iter,
+        n_jobs=n_jobs,
+        verbose=verbose,
+        positive=positive,
+    )
+
+
+def _sparse_encode(
+    X,
+    dictionary,
+    *,
+    gram=None,
+    cov=None,
+    algorithm="lasso_lars",
+    n_nonzero_coefs=None,
+    alpha=None,
+    copy_cov=True,
+    init=None,
+    max_iter=1000,
+    n_jobs=None,
+    verbose=0,
+    positive=False,
+):
+    """Sparse coding without input/parameter validation."""
+
+    n_samples, n_features = X.shape
+    n_components = dictionary.shape[0]
 
     if algorithm in ("lars", "omp"):
         regularization = n_nonzero_coefs
@@ -394,39 +401,46 @@ def sparse_encode(
         if regularization is None:
             regularization = 1.0
 
+    if gram is None and algorithm != "threshold":
+        gram = np.dot(dictionary, dictionary.T)
+
+    if cov is None and algorithm != "lasso_cd":
+        copy_cov = False
+        cov = np.dot(dictionary, X.T)
+
     if effective_n_jobs(n_jobs) == 1 or algorithm == "threshold":
-        code = _sparse_encode(
+        code = _sparse_encode_precomputed(
             X,
             dictionary,
-            gram,
+            gram=gram,
             cov=cov,
             algorithm=algorithm,
             regularization=regularization,
             copy_cov=copy_cov,
             init=init,
             max_iter=max_iter,
-            check_input=False,
             verbose=verbose,
             positive=positive,
         )
         return code
 
     # Enter parallel code block
+    n_samples = X.shape[0]
+    n_components = dictionary.shape[0]
     code = np.empty((n_samples, n_components))
     slices = list(gen_even_slices(n_samples, effective_n_jobs(n_jobs)))
 
     code_views = Parallel(n_jobs=n_jobs, verbose=verbose)(
-        delayed(_sparse_encode)(
+        delayed(_sparse_encode_precomputed)(
             X[this_slice],
             dictionary,
-            gram,
-            cov[:, this_slice] if cov is not None else None,
-            algorithm,
+            gram=gram,
+            cov=cov[:, this_slice] if cov is not None else None,
+            algorithm=algorithm,
             regularization=regularization,
             copy_cov=copy_cov,
             init=init[this_slice] if init is not None else None,
             max_iter=max_iter,
-            check_input=False,
             verbose=verbose,
             positive=positive,
         )
@@ -2205,13 +2219,12 @@ def _minibatch_step(self, X, dictionary, random_state, step):
         batch_size = X.shape[0]
 
         # Compute code for this batch
-        code = sparse_encode(
+        code = _sparse_encode(
             X,
             dictionary,
             algorithm=self._fit_algorithm,
             alpha=self.alpha,
             n_jobs=self.n_jobs,
-            check_input=False,
             positive=self.positive_code,
             max_iter=self.transform_max_iter,
             verbose=self.verbose,
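
Stripped of estimator-specific detail, the refactor follows a validate-once pattern: checks run a single time at the public boundary, and hot loops call a private, unchecked helper. A schematic sketch with hypothetical names (`public_encode` and `_encode_unchecked` are not the real functions):

    import numpy as np

    def public_encode(X, dictionary):
        # Validate once, at the public API boundary.
        X = np.asarray(X, dtype=float)
        dictionary = np.asarray(dictionary, dtype=float)
        if dictionary.shape[1] != X.shape[1]:
            raise ValueError("Dictionary and X have different numbers of features")
        return _encode_unchecked(X, dictionary)

    def _encode_unchecked(X, dictionary):
        # Hot paths (e.g. a minibatch step executed once per batch) call this
        # helper directly, skipping the repeated checks above.
        return X @ np.linalg.pinv(dictionary)  # least-squares codes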
