[MRG] Add get_feature_names to PCA #6445

Closed

74 changes: 74 additions & 0 deletions sklearn/decomposition/pca.py
@@ -278,6 +278,80 @@ def __init__(self, n_components=None, copy=True, whiten=False,
        self.iterated_power = iterated_power
        self.random_state = random_state

    def get_feature_names(self, input_features=None, show_coef=False):
        """
        Return the feature names of each principal component.

        Parameters
        ----------
        input_features : list of string, shape (n_features), optional
            String names for input features if available. By default,
            "x0", "x1", ... "xn_features" is used.

        show_coef : boolean or integer, default False
            If True, return each principal component written as a linear
            combination of the input features. If False, return only the
            component names. If an integer n, return the top n
            contributions (by absolute value) to each component, sorted
            in decreasing order.

        Returns
        -------
        output_feature_names : list of string, shape (n_components)
            If show_coef is True or an integer, each entry expresses the
            component in terms of the contributing input features; if
            show_coef is False, each entry is just the component name.

        Examples
        --------
        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> pca = PCA(n_components=2).fit(X)
        >>> pca.get_feature_names(show_coef=True)
        ['-0.84*x0 - 0.54*x1', '0.54*x0 - 0.84*x1']
        >>> pca.get_feature_names(show_coef=1)
        ['-0.84*x0', '-0.84*x1']
        >>> pca.get_feature_names()
        ['pc0', 'pc1']
        """
        check_is_fitted(self, 'components_')

        n_features = self.components_.shape[1]
        components = self.components_

        if input_features is None:
            input_features = ['x%d' % i for i in range(n_features)]
        elif len(input_features) != n_features:
            raise ValueError("Length of input_features is {0} but it must "
                             "equal the number of features when fitted: "
                             "{1}.".format(len(input_features), n_features))

        def name_generator(coefficients, names):
            yield "{0:.2g}*{1}".format(coefficients[0], names[0])
            for c, n in zip(coefficients[1:], names[1:]):
                yield "{0:s} {1:.2g}*{2}".format('-' if c < 0 else '+',
                                                 abs(c), n)

        if show_coef is True:
            feature_names = [' '.join(name_generator(components[i],
                                                     input_features))
                             for i in range(self.n_components)]
        elif show_coef is False:
            feature_names = ['pc{0}'.format(i)
                             for i in range(self.n_components)]
        elif isinstance(show_coef, six.integer_types):
            if show_coef < 1 or show_coef > n_features:
                raise ValueError("show_coef is {0} but it must be between 1 "
                                 "and the number of features "
                                 "{1}.".format(show_coef, n_features))
            contribution = np.argsort(np.abs(components), axis=1)[:, ::-1]
            required = contribution[:, :show_coef]
            input_features = np.asarray(input_features)
            feature_names = [' '.join(
                name_generator(components[i][required[i]],
                               input_features[required[i]]))
                for i in range(self.n_components)]
        else:
            raise ValueError("show_coef must be an integer or boolean")
        return feature_names


    def fit(self, X, y=None):
        """Fit the model with X.

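For context, here is a minimal usage sketch of the proposed API (assuming this branch is installed; the iris data and the shortened feature names are only illustrative, and the printed coefficients depend on the fit):

from sklearn import datasets
from sklearn.decomposition import PCA

iris = datasets.load_iris()
pca = PCA(n_components=2).fit(iris.data)

# Plain component names
print(pca.get_feature_names())  # ['pc0', 'pc1']

# Components written as combinations of caller-supplied feature names
names = ['sep_len', 'sep_wid', 'pet_len', 'pet_wid']
print(pca.get_feature_names(input_features=names, show_coef=True))

# Only the single strongest absolute contribution per component
print(pca.get_feature_names(input_features=names, show_coef=1))
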
16 changes: 16 additions & 0 deletions sklearn/decomposition/tests/test_pca.py
@@ -11,6 +11,7 @@
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_array_equal

from sklearn import datasets
from sklearn.decomposition import PCA
@@ -507,3 +508,18 @@ def fit_deprecated(X):
    assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X)
    Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X)
    assert_array_almost_equal(Y, Y_pca)


def test_get_feature_names():
    X1 = np.array([[-1, -1, 3], [-2, -1, 1], [-3, -2, -1], [1, 1, 2]])
    pca = PCA(n_components=2).fit(X1)
    assert_array_equal(pca.get_feature_names(), ['pc0', 'pc1'])
    assert_array_equal(pca.get_feature_names(show_coef=True),
                       ['-0.66*x0 - 0.46*x1 - 0.59*x2',
                        '-0.38*x0 - 0.47*x1 + 0.79*x2'])
    assert_array_equal(pca.get_feature_names(show_coef=1),
                       ['-0.66*x0', '0.79*x2'])
    # Raise an error when len(input_features) != n_features
    assert_raises(ValueError, pca.get_feature_names, ['a'])
    # Raise an error when show_coef is greater than n_features
    assert_raises(ValueError, pca.get_feature_names, None, 4)
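
As a side note, a self-contained sketch of the argsort-based selection behind integer show_coef; the toy components below mirror the values asserted in the test above:

import numpy as np

# Toy components matrix with n_components=2 and n_features=3.
components = np.array([[-0.66, -0.46, -0.59],
                       [-0.38, -0.47, 0.79]])

# Feature indices ordered by absolute coefficient, largest first.
order = np.argsort(np.abs(components), axis=1)[:, ::-1]

# Keep the single strongest contribution per component (show_coef=1).
print(order[:, :1].ravel())  # [0 2] -> x0 dominates pc0, x2 dominates pc1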