1
1
"""Incremental Principal Components Analysis."""
2
2
3
3
# Author: Kyle Kastner <kastnerkyle@gmail.com>
4
+ # Giorgio Patrini
4
5
# License: BSD 3 clause
5
6
6
7
import numpy as np
7
8
from scipy import linalg
8
9
9
10
from .base import _BasePCA
10
11
from ..utils import check_array , gen_batches
11
- from ..utils .extmath import svd_flip , _batch_mean_variance_update
12
+ from ..utils .extmath import svd_flip , _incremental_mean_and_var
12
13
13
14
14
15
class IncrementalPCA (_BasePCA ):
@@ -76,7 +77,8 @@ class IncrementalPCA(_BasePCA):
76
77
Per-feature empirical mean, aggregate over calls to ``partial_fit``.
77
78
78
79
var_ : array, shape (n_features,)
79
- Per-feature empirical variance, aggregate over calls to ``partial_fit``.
80
+ Per-feature empirical variance, aggregate over calls to
81
+ ``partial_fit``.
80
82
81
83
noise_variance_ : float
82
84
The estimated noise covariance following the Probabilistic PCA model
@@ -85,7 +87,8 @@ class IncrementalPCA(_BasePCA):
85
87
http://www.miketipping.com/papers/met-mppca.pdf.
86
88
87
89
n_components_ : int
88
- The estimated number of components. Relevant when ``n_components=None``.
90
+ The estimated number of components. Relevant when
91
+ ``n_components=None``.
89
92
90
93
n_samples_seen_ : int
91
94
The number of samples processed by the estimator. Will be reset on
@@ -157,14 +160,15 @@ def fit(self, X, y=None):
157
160
Returns the instance itself.
158
161
"""
159
162
self .components_ = None
160
- self .mean_ = None
163
+ self .n_samples_seen_ = 0
164
+ self .mean_ = .0
165
+ self .var_ = .0
161
166
self .singular_values_ = None
162
167
self .explained_variance_ = None
163
168
self .explained_variance_ratio_ = None
164
169
self .noise_variance_ = None
165
- self .var_ = None
166
- self .n_samples_seen_ = 0
167
- X = check_array (X , dtype = np .float )
170
+
171
+ X = check_array (X , copy = self .copy , dtype = [np .float64 , np .float32 ])
168
172
n_samples , n_features = X .shape
169
173
170
174
if self .batch_size is None :
@@ -173,10 +177,11 @@ def fit(self, X, y=None):
173
177
self .batch_size_ = self .batch_size
174
178
175
179
for batch in gen_batches (n_samples , self .batch_size_ ):
176
- self .partial_fit (X [batch ])
180
+ self .partial_fit (X [batch ], check_input = False )
181
+
177
182
return self
178
183
179
- def partial_fit (self , X , y = None ):
184
+ def partial_fit (self , X , y = None , check_input = True ):
180
185
"""Incremental fit with X. All of X is processed as a single batch.
181
186
182
187
Parameters
@@ -190,7 +195,8 @@ def partial_fit(self, X, y=None):
190
195
self: object
191
196
Returns the instance itself.
192
197
"""
193
- X = check_array (X , copy = self .copy , dtype = np .float )
198
+ if check_input :
199
+ X = check_array (X , copy = self .copy , dtype = [np .float64 , np .float32 ])
194
200
n_samples , n_features = X .shape
195
201
if not hasattr (self , 'components_' ):
196
202
self .components_ = None
@@ -204,42 +210,45 @@ def partial_fit(self, X, y=None):
204
210
else :
205
211
self .n_components_ = self .n_components
206
212
207
- if (self .components_ is not None ) and (self .components_ .shape [0 ]
208
- != self .n_components_ ):
213
+ if (self .components_ is not None ) and (self .components_ .shape [0 ] !=
214
+ self .n_components_ ):
209
215
raise ValueError ("Number of input features has changed from %i "
210
216
"to %i between calls to partial_fit! Try "
211
- "setting n_components to a fixed value." % (
212
- self .components_ .shape [0 ], self .n_components_ ))
217
+ "setting n_components to a fixed value." %
218
+ ( self .components_ .shape [0 ], self .n_components_ ))
213
219
214
- if self . components_ is None :
215
- # This is the first pass through partial_fit
220
+ # This is the first partial_fit
221
+ if not hasattr ( self , 'n_samples_seen_' ):
216
222
self .n_samples_seen_ = 0
217
- col_var = X .var (axis = 0 )
218
- col_mean = X .mean (axis = 0 )
223
+ self .mean_ = .0
224
+ self .var_ = .0
225
+
226
+ # Update stats - they are 0 if this is the first step
227
+ col_mean , col_var , n_total_samples = \
228
+ _incremental_mean_and_var (X , last_mean = self .mean_ ,
229
+ last_variance = self .var_ ,
230
+ last_sample_count = self .n_samples_seen_ )
231
+
232
+ # Centering
233
+ if self .n_samples_seen_ == 0 :
234
+ # If it is the first step, simply center X
219
235
X -= col_mean
220
- U , S , V = linalg .svd (X , full_matrices = False )
221
- U , V = svd_flip (U , V , u_based_decision = False )
222
- explained_variance = S ** 2 / n_samples
223
- explained_variance_ratio = S ** 2 / np .sum (col_var *
224
- n_samples )
225
236
else :
226
- col_batch_mean = X .mean (axis = 0 )
227
- col_mean , col_var , n_total_samples = _batch_mean_variance_update (
228
- X , self .mean_ , self .var_ , self .n_samples_seen_ )
237
+ col_batch_mean = np .mean (X , axis = 0 )
229
238
X -= col_batch_mean
230
239
# Build matrix of combined previous basis and new data
231
- mean_correction = np . sqrt (( self . n_samples_seen_ * n_samples ) /
232
- n_total_samples ) * ( self .mean_ -
233
- col_batch_mean )
234
- X_combined = np .vstack ((self .singular_values_ .reshape ((- 1 , 1 )) *
235
- self .components_ , X ,
236
- mean_correction ))
237
- U , S , V = linalg .svd (X_combined , full_matrices = False )
238
- U , V = svd_flip (U , V , u_based_decision = False )
239
- explained_variance = S ** 2 / n_total_samples
240
- explained_variance_ratio = S ** 2 / np .sum (col_var *
241
- n_total_samples )
242
- self .n_samples_seen_ += n_samples
240
+ mean_correction = \
241
+ np . sqrt (( self .n_samples_seen_ * n_samples ) /
242
+ n_total_samples ) * ( self . mean_ - col_batch_mean )
243
+ X = np .vstack ((self .singular_values_ .reshape ((- 1 , 1 )) *
244
+ self .components_ , X , mean_correction ))
245
+
246
+ U , S , V = linalg .svd (X , full_matrices = False )
247
+ U , V = svd_flip (U , V , u_based_decision = False )
248
+ explained_variance = S ** 2 / n_total_samples
249
+ explained_variance_ratio = S ** 2 / np .sum (col_var * n_total_samples )
250
+
251
+ self .n_samples_seen_ = n_total_samples
243
252
self .components_ = V [:self .n_components_ ]
244
253
self .singular_values_ = S [:self .n_components_ ]
245
254
self .mean_ = col_mean
0 commit comments