-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
feature names - NamedArray #14315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
feature names - NamedArray #14315
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
b6f3a34
clean start
adrinjalali dcf8dac
fix init
adrinjalali 348db13
add __getattr__
adrinjalali 68048f5
revert validation changes
adrinjalali e9fe981
fix py35
adrinjalali da0b186
support numpy 1.11
adrinjalali 47272a6
adding docstring
adrinjalali 7c01846
add pandas, not supporting numpy<1.13
adrinjalali dcfa67e
add more tests and implement most of the comments
adrinjalali 5b47f06
implement __dir__
adrinjalali f4ee09f
Merge remote-tracking branch 'upstream/master' into xarray
adrinjalali 42bdf9f
adding sparse named array
adrinjalali b3eaba9
merge upstream/master
adrinjalali 293dbba
fix typo and remove merge note from __init__
adrinjalali a498e4d
understand pandas feature names
adrinjalali 3d70e72
fix the sparse case
adrinjalali d122adb
add force_sparse and sparse from df
adrinjalali 169cbb3
add sparse types to init
adrinjalali a6fbfbd
pep8
adrinjalali File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
# -*- coding: utf-8 -*- | ||
# Authors: Adrin Jalali <adrin.jalali@gmail.com> | ||
# | ||
# License: BSD 3 clause | ||
|
||
import numpy as np | ||
import scipy as sp | ||
|
||
# NDArrayOperatorsMixin was added in numpy 1.13.
# TODO: drop this guard once numpy 1.13 is the minimum supported version.
try:
    from numpy.lib.mixins import NDArrayOperatorsMixin
except ImportError:
    # Fail at import time with an actionable message instead of a bare
    # ImportError pointing at numpy internals.
    raise NotImplementedError("In order to use NamedArray, please upgrade your"
                              " numpy to 1.13+!")
|
||
from .validation import check_array, column_or_1d | ||
|
||
|
||
class FeatureNamesMixin:
    """Mixin providing a validated ``feature_names`` property.

    The names are stored in ``self._feature_names``. The setter reads
    ``self._data`` to learn how many columns the names must describe, so
    the host class has to set ``_data`` before assigning non-``None`` names.
    """

    def _col_count(self, value):
        """Return the number of columns of ``value`` (1 for 1-d input)."""
        return 1 if value.ndim == 1 else value.shape[1]

    @property
    def feature_names(self):
        """Stored feature names, or ``None`` when none are set."""
        return self._feature_names

    @feature_names.setter
    def feature_names(self, value):
        names = value
        if names is not None:
            # A single scalar name is treated as a one-element sequence.
            if np.isscalar(names):
                names = [names]
            names = column_or_1d(names)
            expected = self._col_count(self._data)
            if len(names) != expected:
                raise ValueError("{} column names provided, but data has {} "
                                 "columns".format(len(names), expected))
        self._feature_names = names
|
||
|
||
class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin):
    """A wrapper around a numpy ndarray holding some metadata about the data.

    Instances of this object behave like a numpy array, and lose all metadata
    information in numerical operations. Attributes not found on the wrapper
    are looked up on the wrapped array, and indexing returns whatever
    indexing the wrapped array returns.

    Parameters
    ----------
    data : array-like
        A one or two dimensional array like data. It is validated with
        ``check_array(data, ensure_2d=False)``.

    feature_names : list or array of strings, or None, default=None
        Feature names associated with the columns of the data. The number of
        names must equal the number of columns (a 1-d array counts as one
        column), otherwise a ``ValueError`` is raised. If ``None`` and
        ``data`` has a ``columns`` attribute, the names are taken from it.
    """

    def __init__(self, data, feature_names=None):
        if hasattr(data, 'columns') and feature_names is None:
            feature_names = list(data.columns)
        data = check_array(data, ensure_2d=False)
        # ``_data`` must be set before ``feature_names``: the setter reads it
        # to validate the number of names.
        self._data = data
        self.feature_names = feature_names

    def __getattr__(self, name):
        # ``__getattr__`` only runs when normal lookup fails. If ``_data``
        # itself is missing (an instance created through ``__new__`` and not
        # initialised yet, as happens while copying or unpickling),
        # delegating to ``self._data`` would re-enter this method forever.
        if name == '_data':
            raise AttributeError(name)
        return getattr(self._data, name)

    def __dir__(self):
        # Advertise both the wrapper's attributes and the wrapped array's.
        return list(set(dir(type(self))).union(set(dir(self._data))))

    def __getitem__(self, slice):
        return self._data[slice]

    def __repr__(self):
        prefix = self.__class__.__name__ + '('
        base_repr = np.array2string(self._data,
                                    prefix=prefix)
        return (prefix + base_repr
                + ',\n feature_names={})'.format(
                    str(self.feature_names)))

    def todataframe(self):
        """Returns a `pandas.DataFrame` with set column names."""
        import pandas as pd
        return pd.DataFrame(self._data, columns=self.feature_names)
|
||
|
||
class SparseNamedArrayMixin(FeatureNamesMixin):
    """Mixin adding ``feature_names`` to a sparse matrix class.

    Intended to be listed before a ``scipy.sparse`` matrix class in the bases
    of a concrete class. Every argument except ``feature_names`` is forwarded
    to the next ``__init__`` in the MRO.
    """

    def __init__(self, *args, feature_names=None, **kwargs):
        super().__init__(*args, **kwargs)
        # The feature_names setter takes the column count from ``_data``;
        # here the matrix itself plays that role.
        self._data = self
        self.feature_names = feature_names

    def __repr__(self):
        names_repr = repr(self._feature_names)
        return super().__repr__() + "\nfeature names: %s" % names_repr

    def todataframe(self):
        """Returns a `pandas.DataFrame` with set column names."""
        from pandas import DataFrame
        return DataFrame.sparse.from_spmatrix(
            self, columns=self.feature_names)
|
||
|
||
# One concrete class is needed per scipy sparse matrix format, hence the
# seven near-identical definitions below.
class SparseNamedArrayCSR(SparseNamedArrayMixin, sp.sparse.csr_matrix):
    """``csr_matrix`` carrying ``feature_names``."""


class SparseNamedArrayCSC(SparseNamedArrayMixin, sp.sparse.csc_matrix):
    """``csc_matrix`` carrying ``feature_names``."""


class SparseNamedArrayBSR(SparseNamedArrayMixin, sp.sparse.bsr_matrix):
    """``bsr_matrix`` carrying ``feature_names``."""


class SparseNamedArrayLIL(SparseNamedArrayMixin, sp.sparse.lil_matrix):
    """``lil_matrix`` carrying ``feature_names``."""


class SparseNamedArrayDOK(SparseNamedArrayMixin, sp.sparse.dok_matrix):
    """``dok_matrix`` carrying ``feature_names``."""


class SparseNamedArrayDIA(SparseNamedArrayMixin, sp.sparse.dia_matrix):
    """``dia_matrix`` carrying ``feature_names``."""


class SparseNamedArrayCOO(SparseNamedArrayMixin, sp.sparse.coo_matrix):
    """``coo_matrix`` carrying ``feature_names``."""
|
||
|
||
def make_namedarray(X, feature_names=None, force_sparse=None):
    """Wrap ``X`` in a dense or sparse named-array container.

    Parameters
    ----------
    X : array-like, sparse matrix or object with a ``columns`` attribute
        The data to wrap.

    feature_names : list or array of strings, or None, default=None
        Names of the columns. If ``None`` and ``X`` has a ``columns``
        attribute, ``list(X.columns)`` is used.

    force_sparse : str or None, default=None
        Sparse format key (``'csr'``, ``'csc'``, ``'bsr'``, ``'lil'``,
        ``'dok'``, ``'dia'`` or ``'coo'``) to use when ``X`` is not already
        sparse. Ignored when ``X`` is sparse; its own format is kept then.

    Returns
    -------
    NamedArray or one of the ``SparseNamedArray*`` classes
    """
    if hasattr(X, 'columns') and feature_names is None:
        feature_names = list(X.columns)

    # An already sparse input keeps its format; otherwise the caller may
    # request one explicitly.
    sparse_format = X.format if sp.sparse.issparse(X) else force_sparse
    if not sparse_format:
        return NamedArray(X, feature_names=feature_names)

    sparse_classes = {
        'csr': SparseNamedArrayCSR,
        'csc': SparseNamedArrayCSC,
        'bsr': SparseNamedArrayBSR,
        'lil': SparseNamedArrayLIL,
        'dok': SparseNamedArrayDOK,
        'dia': SparseNamedArrayDIA,
        'coo': SparseNamedArrayCOO,
    }
    container = sparse_classes[sparse_format]
    return container(X, feature_names=feature_names, copy=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import pytest | ||
import numpy as np | ||
|
||
from sklearn.utils.testing import assert_array_equal | ||
from sklearn.utils import NamedArray | ||
|
||
|
||
def test_basics():
    """Names are stored, arithmetic drops the wrapper, indexing works."""
    names = ['a', 'b', 'c']
    x = NamedArray(np.random.rand(5, 3), feature_names=names)
    assert_array_equal(x.feature_names, names)

    # Numerical operations return something that is not a NamedArray.
    for result in (x + 1, x + x, x + np.ones(shape=(5, 3))):
        assert not isinstance(result, NamedArray)

    # Indexing yields the shapes the underlying array would give.
    row, column, head = x[0, :], x[:, 0], x[0:2]
    assert row.shape == (3,)
    assert column.shape == (5,)
    assert head.shape == (2, 3)
|
||
|
||
def test_validation():
    """A wrong number of names is rejected; ``None`` is accepted."""
    data = np.ones(shape=(3, 3))
    with pytest.raises(ValueError, match="column names provided"):
        NamedArray(data, feature_names=[1])

    # allow None as feature_names
    NamedArray(np.ones(shape=(3, 3)))
|
||
|
||
def test_getattr():
    """Attribute access is delegated to the wrapped array."""
    x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c'])
    # These raise AttributeError if __getattr__ doesn't delegate; asserting
    # the values also checks that the right object is delegated to.
    assert x.ndim == 2
    assert x.shape == (5, 3)
|
||
|
||
def test_pandas():
    """``todataframe`` carries the feature names over as column labels."""
    pytest.importorskip("pandas")
    names = ['a', 'b', 'c']
    x = NamedArray(np.random.rand(5, 3), feature_names=names)
    frame = x.todataframe()
    assert all(frame.columns == names)
|
||
|
||
def test_1d():
    """A 1-d array counts as a single column and takes one name."""
    x = NamedArray(np.array([1, 2]), feature_names=['a'])
    # Compare element-wise through the testing helper, as test_basics does.
    # A plain ``==`` against a list only works as an assertion while there
    # is exactly one name.
    assert_array_equal(x.feature_names, ['a'])
|
||
|
||
def test_repr():
    """The repr shows the class name, the data and the feature names."""
    x = NamedArray([[1, 2], [3, 4]], feature_names=['a', 'b'])
    text = repr(x)

    # ``np.array2string(..., prefix=...)`` indents continuation rows so that
    # they line up under the first one, i.e. by the prefix length plus the
    # opening bracket. Build that indent rather than hard-coding spaces.
    indent = " " * len("NamedArray([")
    assert text.startswith("NamedArray([[1 2]\n" + indent + "[3 4]],\n")
    assert text.endswith("feature_names=['a' 'b'])")
|
||
|
||
def test_numpy_attrs():
    """``dir`` of the wrapper is a strict superset of the array's ``dir``."""
    a = np.ones(shape=(1))
    x = NamedArray(a, feature_names='a')
    array_attrs = set(dir(a))
    wrapper_attrs = set(dir(x))
    assert array_attrs < wrapper_attrs
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not tested