
Commit 78ce989

Author giorgiop committed: partial_fit for scalers
1 parent 9ebc6fe commit 78ce989

File tree

12 files changed: +13058 −1658 lines changed


doc/modules/preprocessing.rst

+1 −1

@@ -78,7 +78,7 @@ This class is hence suitable for use in the early steps of a
 >>> scaler.mean_  # doctest: +ELLIPSIS
 array([ 1. ...,  0. ...,  0.33...])
 
->>> scaler.std_  # doctest: +ELLIPSIS
+>>> scaler.scale_  # doctest: +ELLIPSIS
 array([ 0.81...,  0.81...,  1.24...])
 
 >>> scaler.transform(X)  # doctest: +ELLIPSIS
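Note: the docstring now references ``scale_``, which replaces the ``std_`` attribute on the scalers this commit touches. A minimal sketch of that attribute together with the ``partial_fit`` API the commit adds, reusing the same X as the surrounding doctest (illustrative only, not part of the diff):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.array([[1., -1., 2.],
                  [2., 0., 0.],
                  [0., 1., -1.]])

    # Feeding X in two chunks is equivalent to StandardScaler().fit(X):
    # mean_ and var_ are aggregated across partial_fit calls.
    scaler = StandardScaler()
    scaler.partial_fit(X[:2])
    scaler.partial_fit(X[2:])

    print(scaler.mean_)   # array([ 1.  ,  0.  ,  0.33...])
    print(scaler.scale_)  # array([ 0.81...,  0.81...,  1.24...]), formerly std_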

sklearn/decomposition/incremental_pca.py

+47 −38

@@ -1,14 +1,15 @@
 """Incremental Principal Components Analysis."""
 
 # Author: Kyle Kastner <kastnerkyle@gmail.com>
+#         Giorgio Patrini
 # License: BSD 3 clause
 
 import numpy as np
 from scipy import linalg
 
 from .base import _BasePCA
 from ..utils import check_array, gen_batches
-from ..utils.extmath import svd_flip, _batch_mean_variance_update
+from ..utils.extmath import svd_flip, _incremental_mean_and_var
 
 
 class IncrementalPCA(_BasePCA):
@@ -76,7 +77,8 @@ class IncrementalPCA(_BasePCA):
         Per-feature empirical mean, aggregate over calls to ``partial_fit``.
 
     var_ : array, shape (n_features,)
-        Per-feature empirical variance, aggregate over calls to ``partial_fit``.
+        Per-feature empirical variance, aggregate over calls to
+        ``partial_fit``.
 
     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model

@@ -85,7 +87,8 @@ class IncrementalPCA(_BasePCA):
         http://www.miketipping.com/papers/met-mppca.pdf.
 
     n_components_ : int
-        The estimated number of components. Relevant when ``n_components=None``.
+        The estimated number of components. Relevant when
+        ``n_components=None``.
 
     n_samples_seen_ : int
         The number of samples processed by the estimator. Will be reset on
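Note: the import swap above replaces ``_batch_mean_variance_update`` with ``_incremental_mean_and_var``, the helper that folds each new batch into the running per-feature ``mean_`` and ``var_`` documented here. A minimal sketch of the underlying update, assuming the standard pairwise combine formulas (Chan et al.) rather than the private helper's exact code:

    import numpy as np

    def incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
        """Fold batch X into running per-feature mean/variance (sketch)."""
        n_new = X.shape[0]
        n_total = last_sample_count + n_new
        batch_mean = X.mean(axis=0)

        # Combined mean: sample-count-weighted average of the two means.
        updated_mean = (last_sample_count * last_mean +
                        n_new * batch_mean) / n_total

        # Combine sums of squared deviations; the last term corrects for
        # the shift between the running mean and the batch mean.
        old_ssd = last_variance * last_sample_count
        new_ssd = ((X - batch_mean) ** 2).sum(axis=0)
        shift = (last_sample_count * n_new / n_total) * \
            (last_mean - batch_mean) ** 2
        updated_var = (old_ssd + new_ssd + shift) / n_total
        return updated_mean, updated_var, n_total

    rng = np.random.RandomState(0)
    A, B = rng.randn(40, 3), rng.randn(60, 3)
    mean, var, n = incremental_mean_and_var(B, A.mean(axis=0), A.var(axis=0), 40)
    assert np.allclose(mean, np.vstack((A, B)).mean(axis=0))
    assert np.allclose(var, np.vstack((A, B)).var(axis=0))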
@@ -157,14 +160,15 @@ def fit(self, X, y=None):
             Returns the instance itself.
         """
         self.components_ = None
-        self.mean_ = None
+        self.n_samples_seen_ = 0
+        self.mean_ = .0
+        self.var_ = .0
         self.singular_values_ = None
         self.explained_variance_ = None
         self.explained_variance_ratio_ = None
         self.noise_variance_ = None
-        self.var_ = None
-        self.n_samples_seen_ = 0
-        X = check_array(X, dtype=np.float)
+
+        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
 
         if self.batch_size is None:
@@ -173,10 +177,11 @@ def fit(self, X, y=None):
             self.batch_size_ = self.batch_size
 
         for batch in gen_batches(n_samples, self.batch_size_):
-            self.partial_fit(X[batch])
+            self.partial_fit(X[batch], check_input=False)
+
         return self
 
-    def partial_fit(self, X, y=None):
+    def partial_fit(self, X, y=None, check_input=True):
         """Incremental fit with X. All of X is processed as a single batch.
 
         Parameters
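Note: ``fit`` now resets the accumulated statistics up front and validates X once, streaming batches through ``partial_fit(..., check_input=False)`` so each batch skips re-validation. A sketch of the resulting semantics (illustrative, assuming this commit's behavior):

    import numpy as np
    from sklearn.decomposition import IncrementalPCA

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)

    # fit() resets state before batching, so repeated fit() calls do not
    # accumulate samples.
    ipca = IncrementalPCA(n_components=2, batch_size=25)
    ipca.fit(X)
    print(ipca.n_samples_seen_)  # 100

    # partial_fit() accumulates across calls instead.
    ipca_stream = IncrementalPCA(n_components=2)
    for chunk in np.array_split(X, 4):
        ipca_stream.partial_fit(chunk)
    print(ipca_stream.n_samples_seen_)  # also 100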
@@ -190,7 +195,8 @@ def partial_fit(self, X, y=None):
         self: object
             Returns the instance itself.
         """
-        X = check_array(X, copy=self.copy, dtype=np.float)
+        if check_input:
+            X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
         if not hasattr(self, 'components_'):
             self.components_ = None
@@ -204,42 +210,45 @@ def partial_fit(self, X, y=None):
         else:
             self.n_components_ = self.n_components
 
-        if (self.components_ is not None) and (self.components_.shape[0]
-                                               != self.n_components_):
+        if (self.components_ is not None) and (self.components_.shape[0] !=
+                                               self.n_components_):
             raise ValueError("Number of input features has changed from %i "
                              "to %i between calls to partial_fit! Try "
-                             "setting n_components to a fixed value." % (
-                                 self.components_.shape[0], self.n_components_))
+                             "setting n_components to a fixed value." %
+                             (self.components_.shape[0], self.n_components_))
 
-        if self.components_ is None:
-            # This is the first pass through partial_fit
+        # This is the first partial_fit
+        if not hasattr(self, 'n_samples_seen_'):
             self.n_samples_seen_ = 0
-            col_var = X.var(axis=0)
-            col_mean = X.mean(axis=0)
+            self.mean_ = .0
+            self.var_ = .0
+
+        # Update stats - they are 0 if this is the first step
+        col_mean, col_var, n_total_samples = \
+            _incremental_mean_and_var(X, last_mean=self.mean_,
+                                      last_variance=self.var_,
+                                      last_sample_count=self.n_samples_seen_)
+
+        # Whitening
+        if self.n_samples_seen_ == 0:
+            # If it is the first step, simply whiten X
             X -= col_mean
-            U, S, V = linalg.svd(X, full_matrices=False)
-            U, V = svd_flip(U, V, u_based_decision=False)
-            explained_variance = S ** 2 / n_samples
-            explained_variance_ratio = S ** 2 / np.sum(col_var *
-                                                       n_samples)
         else:
-            col_batch_mean = X.mean(axis=0)
-            col_mean, col_var, n_total_samples = _batch_mean_variance_update(
-                X, self.mean_, self.var_, self.n_samples_seen_)
+            col_batch_mean = np.mean(X, axis=0)
             X -= col_batch_mean
             # Build matrix of combined previous basis and new data
-            mean_correction = np.sqrt((self.n_samples_seen_ * n_samples) /
-                                      n_total_samples) * (self.mean_ -
-                                                          col_batch_mean)
-            X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) *
-                                    self.components_, X,
-                                    mean_correction))
-            U, S, V = linalg.svd(X_combined, full_matrices=False)
-            U, V = svd_flip(U, V, u_based_decision=False)
-            explained_variance = S ** 2 / n_total_samples
-            explained_variance_ratio = S ** 2 / np.sum(col_var *
-                                                       n_total_samples)
-        self.n_samples_seen_ += n_samples
+            mean_correction = \
+                np.sqrt((self.n_samples_seen_ * n_samples) /
+                        n_total_samples) * (self.mean_ - col_batch_mean)
+            X = np.vstack((self.singular_values_.reshape((-1, 1)) *
+                           self.components_, X, mean_correction))
+
+        U, S, V = linalg.svd(X, full_matrices=False)
+        U, V = svd_flip(U, V, u_based_decision=False)
+        explained_variance = S ** 2 / n_total_samples
+        explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples)
+
+        self.n_samples_seen_ = n_total_samples
         self.components_ = V[:self.n_components_]
         self.singular_values_ = S[:self.n_components_]
         self.mean_ = col_mean
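Note: the ``np.vstack`` in the else-branch is the heart of the incremental SVD merge. The previous basis scaled by its singular values stands in for all data seen so far (it has the same Gram matrix as the centered past data, up to truncation), and the single ``mean_correction`` row accounts for the shift between the old running mean and the new batch mean. The identity this relies on can be checked numerically; a minimal sketch, independent of the estimator:

    import numpy as np

    rng = np.random.RandomState(1)
    A, B = rng.randn(30, 4), rng.randn(20, 4)  # "seen so far" and "new batch"
    n_a, n_b = len(A), len(B)
    correction = np.sqrt(n_a * n_b / (n_a + n_b)) * \
        (A.mean(axis=0) - B.mean(axis=0))

    # Each block centered on its own mean, plus the rank-one correction row ...
    stacked = np.vstack((A - A.mean(axis=0), B - B.mean(axis=0), correction))

    # ... has the same singular values as the union centered on the combined
    # mean, which is exactly what partial_fit's SVD needs.
    full = np.vstack((A, B)) - np.vstack((A, B)).mean(axis=0)
    s_stacked = np.linalg.svd(stacked, compute_uv=False)
    s_full = np.linalg.svd(full, compute_uv=False)
    assert np.allclose(s_stacked, s_full)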
