
[MRG+2] Implemented SelectFromModel meta-transformer #4242


Merged (11 commits) on Oct 11, 2015
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -463,6 +463,7 @@ From text
feature_selection.SelectKBest
feature_selection.SelectFpr
feature_selection.SelectFdr
feature_selection.SelectFromModel
feature_selection.SelectFwe
feature_selection.RFE
feature_selection.RFECV
50 changes: 37 additions & 13 deletions doc/modules/feature_selection.rst
@@ -131,33 +131,52 @@ number of features.
elimination example with automatic tuning of the number of features
selected with cross-validation.

.. _select_from_model:

.. _l1_feature_selection:
Feature selection using SelectFromModel
=======================================

Member (review comment): Perhaps just "Feature selection from estimated models" or something...

Member Author (reply): I think it might be more useful to explicitly mention the name of the model in the title.

:class:`SelectFromModel` is a meta-transformer that can be used along with any
estimator that has a ``coef_`` or ``feature_importances_`` attribute after fitting.
The features are considered unimportant and removed if the corresponding
``coef_`` or ``feature_importances_`` values are below the provided
``threshold`` parameter. Apart from specifying the threshold numerically,
there are built-in heuristics for finding a threshold using a string argument.
Available heuristics are "mean", "median" and float multiples of these like
"0.1*mean".

For examples of how it is to be used, refer to the sections below.

.. topic:: Examples

* :ref:`example_feature_selection_plot_select_from_model_boston.py`: Selecting the two
most important features from the Boston dataset without knowing the
threshold beforehand.
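
As a rough illustration of these threshold heuristics (an editorial sketch, not
part of this diff; the forest estimator and the "0.5*mean" threshold are
arbitrary choices, and the number of selected features depends on the fitted
importances)::

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.feature_selection import SelectFromModel
    >>> iris = load_iris()
    >>> X, y = iris.data, iris.target
    >>> sfm = SelectFromModel(RandomForestClassifier(), threshold="0.5*mean")
    >>> X_reduced = sfm.fit(X, y).transform(X)
    >>> X_reduced.shape  # doctest: +SKIP
    (150, 2)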

L1-based feature selection
==========================
--------------------------

.. currentmodule:: sklearn

Selecting non-zero coefficients
---------------------------------

:ref:`Linear models <linear_model>` penalized with the L1 norm have
sparse solutions: many of their estimated coefficients are zero. When the goal
is to reduce the dimensionality of the data to use with another classifier,
they expose a ``transform`` method to select the non-zero coefficient. In
particular, sparse estimators useful for this purpose are the
:class:`linear_model.Lasso` for regression, and
they can be used along with :class:`feature_selection.SelectFromModel`
to select the non-zero coefficients. In particular, sparse estimators useful for
this purpose are the :class:`linear_model.Lasso` for regression, and
of :class:`linear_model.LogisticRegression` and :class:`svm.LinearSVC`
for classification::

>>> from sklearn.svm import LinearSVC
>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectFromModel
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>> X.shape
(150, 4)
>>> X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, y)
>>> lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
>>> model = SelectFromModel(lsvc, prefit=True)
>>> X_new = model.transform(X)
>>> X_new.shape
(150, 3)
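
Equivalently (an editorial sketch, not part of this diff), ``SelectFromModel``
can fit the wrapped estimator itself instead of being given a pre-fitted one,
which is convenient inside pipelines; the shape shown assumes the same iris
data and settings as above::

    >>> model = SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))
    >>> X_new = model.fit_transform(X, y)
    >>> X_new.shape  # doctest: +SKIP
    (150, 3)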

@@ -241,23 +260,27 @@ of features non zero.
http://hal.inria.fr/hal-00354771/

Tree-based feature selection
============================
----------------------------

Tree-based estimators (see the :mod:`sklearn.tree` module and forest
of trees in the :mod:`sklearn.ensemble` module) can be used to compute
feature importances, which in turn can be used to discard irrelevant
features::
features (when coupled with the :class:`sklearn.feature_selection.SelectFromModel`
meta-transformer)::

>>> from sklearn.ensemble import ExtraTreesClassifier
>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectFromModel
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>> X.shape
(150, 4)
>>> clf = ExtraTreesClassifier()
>>> X_new = clf.fit(X, y).transform(X)
>>> clf = clf.fit(X, y)
>>> clf.feature_importances_ # doctest: +SKIP
array([ 0.04..., 0.05..., 0.4..., 0.4...])
>>> model = SelectFromModel(clf, prefit=True)
>>> X_new = model.transform(X)
>>> X_new.shape # doctest: +SKIP
(150, 2)

@@ -278,12 +301,13 @@ the actual learning. The recommended way to do this in scikit-learn is
to use a :class:`sklearn.pipeline.Pipeline`::

clf = Pipeline([
    ('feature_selection', LinearSVC(penalty="l1")),
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
    ('classification', RandomForestClassifier())
])
clf.fit(X, y)

In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
coupled with :class:`sklearn.feature_selection.SelectFromModel`
to evaluate feature importances and select the most relevant features.
Then, a :class:`sklearn.ensemble.RandomForestClassifier` is trained on the
transformed output, i.e. using only relevant features. You can perform
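
For reference, a self-contained version of the snippet above might look like
the following (an editorial sketch, not taken from this diff; the iris data,
``dual=False`` and the default hyper-parameters are illustrative assumptions)::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    iris = load_iris()
    X, y = iris.data, iris.target

    clf = Pipeline([
        # L1-penalized LinearSVC exposes a sparse coef_, which SelectFromModel
        # thresholds to drop uninformative columns.
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        # The downstream classifier sees only the selected features.
        ('classification', RandomForestClassifier())
    ])
    clf.fit(X, y)
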
12 changes: 12 additions & 0 deletions doc/whats_new.rst
@@ -207,6 +207,11 @@ Enhancements
the same. This allows gradient boosters to turn off presorting when building
deep trees or using sparse data. By `Jacob Schreiber`_.

- Added :class:`feature_selection.SelectFromModel` meta-transformer which can
  be used along with estimators that have a `coef_` or `feature_importances_`
  attribute to select important features of the input data. By
  `Maheshakya Wijewardena`_, `Joel Nothman`_ and `Manoj Kumar`_.

Member (review comment): Also should have entry in API changes describing deprecation.

Bug fixes
.........

@@ -269,6 +274,13 @@ API changes summary
fit method to the constructor in
:class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.

- Models inheriting from ``_LearntSelectorMixin`` will no longer support the
  ``transform`` method (i.e. RandomForests, GradientBoosting, LogisticRegression,
  DecisionTrees, SVMs and SGD-related models). Wrap these models with the
  meta-transformer :class:`feature_selection.SelectFromModel` to remove
  features (according to `coef_` or `feature_importances_`)
  which are below a certain threshold value instead.
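
A rough before/after sketch of this migration (an editorial illustration, not
part of the changelog entry; the iris data, the random forest and the "mean"
threshold are arbitrary examples)::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    iris = load_iris()
    X, y = iris.data, iris.target
    clf = RandomForestClassifier().fit(X, y)

    # Deprecated in 0.17, to be removed in 0.19:
    X_reduced = clf.transform(X, threshold="mean")

    # Recommended replacement:
    X_reduced = SelectFromModel(clf, prefit=True, threshold="mean").transform(X)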

.. _changes_0_1_16:

Version 0.16.1
7 changes: 4 additions & 3 deletions examples/ensemble/plot_feature_transformation.py
@@ -34,6 +34,7 @@
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
GradientBoostingClassifier)
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve
@@ -53,12 +54,12 @@
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
rt_lm = LogisticRegression()
rt.fit(X_train, y_train)
rt_lm.fit(rt.transform(X_train_lr), y_train_lr)
rt_lm.fit(SelectFromModel(rt, prefit=True).transform(X_train_lr), y_train_lr)

y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
y_pred_rt = rt_lm.predict_proba(
    SelectFromModel(rt, prefit=True).transform(X_test))[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)


# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
5 changes: 4 additions & 1 deletion examples/ensemble/plot_random_forest_embedding.py
@@ -30,14 +30,17 @@
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)
hasher.fit(X)
model = SelectFromModel(hasher, prefit=True)
X_transformed = model.transform(X)

# Visualize result using PCA
pca = TruncatedSVD(n_components=2)
51 changes: 51 additions & 0 deletions examples/feature_selection/plot_select_from_model_boston.py
@@ -0,0 +1,51 @@
"""
===================================================
Feature selection using SelectFromModel and LassoCV
===================================================

Use SelectFromModel meta-transformer along with Lasso to select the best
couple of features from the Boston dataset.
"""
# Author: Manoj Kumar <mks542@nyu.edu>
# License: BSD 3 clause

print(__doc__)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold until the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the meta-transformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title(
"Features selected from Boston using SelectFromModel with "
"threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
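
An editorial aside, not part of the new example file: ``get_support`` on the
fitted ``sfm`` object reports which Boston columns were kept, which could be
used to label the axes above, e.g.::

    selected_idx = sfm.get_support(indices=True)
    print([boston['feature_names'][i] for i in selected_idx])
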
11 changes: 8 additions & 3 deletions sklearn/ensemble/tests/test_forest.py
@@ -19,6 +19,7 @@
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix

from sklearn.utils import warnings
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
@@ -194,15 +195,19 @@ def test_probability():
def check_importances(X, y, name, criterion):
    ForestEstimator = FOREST_ESTIMATORS[name]

    est = ForestEstimator(n_estimators=20, criterion=criterion,random_state=0)
    est = ForestEstimator(n_estimators=20, criterion=criterion,
                          random_state=0)
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    X_new = est.transform(X, threshold="mean")
    assert_less(X_new.shape[1], X.shape[1])
    # XXX: Remove this test in 0.19 after transform support to estimators
    # is removed.
    X_new = assert_warns(
        DeprecationWarning, est.transform, X, threshold="mean")
    assert_less(0 < X_new.shape[1], X.shape[1])

    # Check with parallel
    importances = est.feature_importances_
10 changes: 7 additions & 3 deletions sklearn/ensemble/tests/test_gradient_boosting.py
@@ -26,6 +26,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.validation import DataConversionWarning
from sklearn.utils.validation import NotFittedError

@@ -296,10 +297,13 @@ def test_feature_importances():
    clf.fit(X, y)
    assert_true(hasattr(clf, 'feature_importances_'))

    X_new = clf.transform(X, threshold="mean")
    # XXX: Remove this test in 0.19 after transform support to estimators
    # is removed.
    X_new = assert_warns(
        DeprecationWarning, clf.transform, X, threshold="mean")
    assert_less(X_new.shape[1], X.shape[1])

    feature_mask = clf.feature_importances_ > clf.feature_importances_.mean()
    feature_mask = (
        clf.feature_importances_ > clf.feature_importances_.mean())
    assert_array_almost_equal(X_new, X[:, feature_mask])


5 changes: 4 additions & 1 deletion sklearn/feature_selection/__init__.py
@@ -20,6 +20,8 @@
from .rfe import RFE
from .rfe import RFECV

from .from_model import SelectFromModel

__all__ = ['GenericUnivariateSelect',
'RFE',
'RFECV',
@@ -32,4 +34,5 @@
'chi2',
'f_classif',
'f_oneway',
'f_regression']
'f_regression',
'SelectFromModel']
2 changes: 1 addition & 1 deletion sklearn/feature_selection/base.py
@@ -81,7 +81,7 @@ def transform(self, X):
            return np.empty(0).reshape((X.shape[0], 0))
        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")
        return check_array(X, accept_sparse='csr')[:, safe_mask(X, mask)]
        return X[:, safe_mask(X, mask)]

    def inverse_transform(self, X):
        """