diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index f5b548a5278cd..2addb67560313 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -189,7 +189,7 @@ def build_preprocessor(self): # hundreds of nanoseconds which is negligible when compared to the # cost of tokenizing a string of 1000 chars for instance. noop = lambda x: x - + # accent stripping if not self.strip_accents: strip_accents = noop @@ -996,6 +996,12 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): sublinear_tf : boolean, default=False Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). + Attributes + ---------- + idf_ : numpy array of shape [n_features] + returns None unless use_idf=True, then + returns 1-D array containing idf(d,t). + References ---------- @@ -1035,9 +1041,8 @@ def fit(self, X, y=None): # log+1 instead of log makes sure terms with zero idf don't get # suppressed entirely. idf = np.log(float(n_samples) / df) + 1.0 - self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, + self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features, format='csr') - return self def transform(self, X, copy=True):