From b6f3a348ec8c545fac8e0f100303ed26c859b5d7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 15:52:12 +0200 Subject: [PATCH 01/17] clean start --- sklearn/utils/_namedarray.py | 69 ++++++++++++++++++++++++++ sklearn/utils/tests/test_namedarray.py | 27 ++++++++++ sklearn/utils/validation.py | 54 +++++++++++++++++--- 3 files changed, 144 insertions(+), 6 deletions(-) create mode 100644 sklearn/utils/_namedarray.py create mode 100644 sklearn/utils/tests/test_namedarray.py diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py new file mode 100644 index 0000000000000..de910b7e67617 --- /dev/null +++ b/sklearn/utils/_namedarray.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# Authors: Adrin Jalali +# +# License: BSD 3 clause + +import numpy as np +from numpy.lib.mixins import NDArrayOperatorsMixin +from .validation import check_array, column_or_1d + + +class NamedArray(NDArrayOperatorsMixin): + _feature_names = None + _data = None + + def __init__(self, data, feature_names=None): + self.data = data + self.feature_names = feature_names + + @property + def data(self): + return self._data + + @data.setter + def data(self, value): + value = check_array(value) + self._data = value + + if self.feature_names is None: + return + + if len(self.feature_names) != self._col_count(value): + self._feature_names = None + + @property + def feature_names(self): + return self._feature_names + + @feature_names.setter + def feature_names(self, value): + if value is None: + self._feature_names = None + return + + value = column_or_1d(value) + col_count = self._col_count(self.data) + if len(value) != col_count: + raise ValueError("{} column names provided, but data has {} " + "columns".format(len(value), col_count)) + + self._feature_names = value + + def _col_count(self, value): + if value.ndim == 1: + return 1 + else: + return value.shape[1] + + def __getitem__(self, slice): + return self.data[slice] + + def __array__(self, *args, **kwargs): + return self.data.__array__(*args, **kwargs) + + def __repr__(self): + prefix = self.__class__.__name__ + '(' + base_repr = np.array2string(self.data, + prefix=prefix) + return (prefix + base_repr + + f',\n feature_names={self.feature_names!r})') diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py new file mode 100644 index 0000000000000..a317bc5e64c8f --- /dev/null +++ b/sklearn/utils/tests/test_namedarray.py @@ -0,0 +1,27 @@ +import pytest +import numpy as np + +from sklearn.utils.testing import assert_array_equal +from sklearn.utils import NamedArray + + +def test_basics(): + x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c']) + assert_array_equal(x.feature_names, ['a', 'b', 'c']) + assert not isinstance(x + 1, NamedArray) + assert not isinstance(x + x, NamedArray) + assert not isinstance(x + np.ones(shape=(5, 3)), NamedArray) + + +def test_validation(): + with pytest.raises(ValueError, match="column names provided"): + NamedArray(np.ones(shape=(3, 3)), feature_names=[1]) + + # allow None as feature_names + NamedArray(np.ones(shape=(3, 3))) + + x = NamedArray(np.ones(shape=(3, 3)), feature_names=[1, 2, 3]) + x.data = np.ones(shape=(4, 3)) + assert x.feature_names is not None + x.data = np.ones(shape=(4, 4)) + assert x.feature_names is None diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 390f537be95a5..d0fcf1c0310e2 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -15,7 +15,6 @@ import scipy.sparse as sp from distutils.version import LooseVersion from inspect import signature - from numpy.core.numeric import ComplexWarning import joblib @@ -316,7 +315,7 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, if force_all_finite: if not hasattr(spmatrix, "data"): warnings.warn("Can't check %s sparse matrix for nan or inf." - % spmatrix.format) + % spmatrix.format, stacklevel=2) else: _assert_all_finite(spmatrix.data, allow_nan=force_all_finite == 'allow-nan') @@ -431,7 +430,11 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, "'warn_on_dtype' is deprecated in version 0.21 and will be " "removed in 0.23. Don't set `warn_on_dtype` to remove this " "warning.", - DeprecationWarning) + DeprecationWarning, stacklevel=2) + + # duck-typing to avoid a circular import + if hasattr(array, "_data") and hasattr(array, "_feature_names"): + array = array._data # store reference to original array to check if copy is needed when # function returns @@ -531,7 +534,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, "a float dtype before using it in scikit-learn, " "for example by using " "your_array = your_array.astype(np.float64).", - FutureWarning) + FutureWarning, stacklevel=2) # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind == "O": @@ -562,7 +565,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig: msg = ("Data with input dtype %s was converted to %s%s." % (dtype_orig, array.dtype, context)) - warnings.warn(msg, DataConversionWarning) + warnings.warn(msg, DataConversionWarning, stacklevel=2) if copy and np.may_share_memory(array, array_orig): array = np.array(array, dtype=dtype, order=order) @@ -855,7 +858,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, raise ValueError("Array must be symmetric") if raise_warning: warnings.warn("Array is not symmetric, and will be converted " - "to symmetric by average with its transpose.") + "to symmetric by average with its transpose.", + stacklevel=2) if sp.issparse(array): conversion = 'to' + array.format array = getattr(0.5 * (array + array.T), conversion)() @@ -985,6 +989,44 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) +def _feature_names(X): + if isinstance(X, NamedArray): + return X.features + elif hasattr(X, 'columns'): + return X.columns + else: + return None + + +def _check_NamedArray(X, feature_names): + # need to ignore feature names on a bunch of input types + # not sure how to handle this. + if feature_names is None and ( + isinstance(X, list) or + isinstance(X, tuple) or + isinstance(X, dict)): + print("returning input") + return X + + if feature_names is None: + feature_names = ['x%s' % i for i in range(X.shape[1])] + + if isinstance(X, NamedArray): + X.features = feature_names + else: + X = NamedArray(X, feature_names) + + return X + + +def _check_feature_names(X, fitted_estimator): + if not hasattr(fitted_estimator, 'feature_names_out_'): + return X + + return _check_NamedArray( + X, fitted_estimator.feature_names_out_) + + def _check_sample_weight(sample_weight, X, dtype=None): """Validate sample weights. From dcf8dac33d8634577136b5c8d469dd40bcfd38ba Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 15:53:25 +0200 Subject: [PATCH 02/17] fix init --- sklearn/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index efcaf6865faa5..4987d3da8a641 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -23,6 +23,7 @@ check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric, check_scalar) +from ._namedarray import NamedArray from .. import get_config @@ -67,7 +68,7 @@ class Parallel(_joblib.Parallel): "check_symmetric", "indices_to_mask", "deprecated", "cpu_count", "Parallel", "Memory", "delayed", "parallel_backend", "register_parallel_backend", "hash", "effective_n_jobs", - "resample", "shuffle", "check_matplotlib_support"] + "resample", "shuffle", "check_matplotlib_support", "NamedArray"] IS_PYPY = platform.python_implementation() == 'PyPy' _IS_32BIT = 8 * struct.calcsize("P") == 32 From 348db13e4ad6fdcd862d18976d87b40009ae9845 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 15:59:23 +0200 Subject: [PATCH 03/17] add __getattr__ --- sklearn/utils/_namedarray.py | 3 +++ sklearn/utils/tests/test_namedarray.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index de910b7e67617..a7b7abfce1bbc 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -55,6 +55,9 @@ def _col_count(self, value): else: return value.shape[1] + def __getattr__(self, name): + return getattr(self._data, name) + def __getitem__(self, slice): return self.data[slice] diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py index a317bc5e64c8f..daf51a3db7869 100644 --- a/sklearn/utils/tests/test_namedarray.py +++ b/sklearn/utils/tests/test_namedarray.py @@ -25,3 +25,10 @@ def test_validation(): assert x.feature_names is not None x.data = np.ones(shape=(4, 4)) assert x.feature_names is None + + +def test_getattr(): + x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c']) + # these would fail if __getattr__ doesn't work + x.ndim + x.shape From 68048f5e2f94b40635316e9d9c8558f6d7842e0c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 16:11:44 +0200 Subject: [PATCH 04/17] revert validation changes --- sklearn/utils/validation.py | 43 +------------------------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d0fcf1c0310e2..2150a54ad75d9 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -15,6 +15,7 @@ import scipy.sparse as sp from distutils.version import LooseVersion from inspect import signature + from numpy.core.numeric import ComplexWarning import joblib @@ -432,10 +433,6 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, "warning.", DeprecationWarning, stacklevel=2) - # duck-typing to avoid a circular import - if hasattr(array, "_data") and hasattr(array, "_feature_names"): - array = array._data - # store reference to original array to check if copy is needed when # function returns array_orig = array @@ -989,44 +986,6 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _feature_names(X): - if isinstance(X, NamedArray): - return X.features - elif hasattr(X, 'columns'): - return X.columns - else: - return None - - -def _check_NamedArray(X, feature_names): - # need to ignore feature names on a bunch of input types - # not sure how to handle this. - if feature_names is None and ( - isinstance(X, list) or - isinstance(X, tuple) or - isinstance(X, dict)): - print("returning input") - return X - - if feature_names is None: - feature_names = ['x%s' % i for i in range(X.shape[1])] - - if isinstance(X, NamedArray): - X.features = feature_names - else: - X = NamedArray(X, feature_names) - - return X - - -def _check_feature_names(X, fitted_estimator): - if not hasattr(fitted_estimator, 'feature_names_out_'): - return X - - return _check_NamedArray( - X, fitted_estimator.feature_names_out_) - - def _check_sample_weight(sample_weight, X, dtype=None): """Validate sample weights. From e9fe981daf41b875e2e0cc7c65a978063265d59b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 16:24:47 +0200 Subject: [PATCH 05/17] fix py35 --- sklearn/utils/_namedarray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index a7b7abfce1bbc..08957a0f45d9a 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -69,4 +69,5 @@ def __repr__(self): base_repr = np.array2string(self.data, prefix=prefix) return (prefix + base_repr - + f',\n feature_names={self.feature_names!r})') + + ',\n feature_names={})'.format( + str(self.feature_names))) From da0b186cb48df15951e7b670652dae2dd9c9be09 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 16:42:38 +0200 Subject: [PATCH 06/17] support numpy 1.11 --- sklearn/utils/_namedarray.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index 08957a0f45d9a..1efa8e2387537 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -4,7 +4,14 @@ # License: BSD 3 clause import numpy as np -from numpy.lib.mixins import NDArrayOperatorsMixin + +# NDArrayOperatorsMixin was added in numpy 1.13 +# TODO: cleanup once we support numpy 1.13+ +try: + from numpy.lib.mixins import NDArrayOperatorsMixin +except ImportError: + from .mixins import NDArrayOperatorsMixin + from .validation import check_array, column_or_1d From 47272a6cfda180b6bfe9299f7dd56cd8464f4b89 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 16:53:51 +0200 Subject: [PATCH 07/17] adding docstring --- sklearn/utils/_namedarray.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index 1efa8e2387537..c26b65fc9f3a9 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -16,6 +16,23 @@ class NamedArray(NDArrayOperatorsMixin): + """A wrapper to a numpy ndarray holding some metadata about the data. + + Instances of this object behave like a numpy array, and loose all metadata + information in numerical operations. + + Parameters + ---------- + data: array-like + A one or two dimensional array like data. + + feature_names: list or array of strings, or None, default=None + Feature names associated with the columns of the data. The number of + columns should always be the same as the number of feature names. + Setting the `data` of an instance, would result in `feature_names` to + be `None` if the number of columns do not match the number of stored + feature names. + """ _feature_names = None _data = None From 7c018463f8a9bc2939a73964b5d552f59141cc23 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 17:19:58 +0200 Subject: [PATCH 08/17] add pandas, not supporting numpy<1.13 --- sklearn/utils/_namedarray.py | 8 +++++++- sklearn/utils/tests/test_namedarray.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index c26b65fc9f3a9..65517f9416817 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -10,7 +10,8 @@ try: from numpy.lib.mixins import NDArrayOperatorsMixin except ImportError: - from .mixins import NDArrayOperatorsMixin + raise NotImplementedError("In order to use NamedAraay, please upgrade your " + "numpy to 1.13+!") from .validation import check_array, column_or_1d @@ -95,3 +96,8 @@ def __repr__(self): return (prefix + base_repr + ',\n feature_names={})'.format( str(self.feature_names))) + + def todataframe(self): + """Returns a `pandas.DataFrame` with set column names.""" + import pandas as pd + return pd.DataFrame(self._data, columns=self.feature_names) diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py index daf51a3db7869..6705106572a12 100644 --- a/sklearn/utils/tests/test_namedarray.py +++ b/sklearn/utils/tests/test_namedarray.py @@ -32,3 +32,9 @@ def test_getattr(): # these would fail if __getattr__ doesn't work x.ndim x.shape + + +def test_pandas(): + _ = pytest.importorskip("pandas") + x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c']) + assert all(x.todataframe().columns == ['a', 'b', 'c']) From dcfa67ed9b20e83c6460e0a649ad2ad1f640fa2f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 18:51:13 +0200 Subject: [PATCH 09/17] add more tests and implement most of the comments --- sklearn/utils/_namedarray.py | 31 ++++++-------------------- sklearn/utils/tests/test_namedarray.py | 22 +++++++++++++----- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index 65517f9416817..bcad06ecf8991 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -34,28 +34,12 @@ class NamedArray(NDArrayOperatorsMixin): be `None` if the number of columns do not match the number of stored feature names. """ - _feature_names = None - _data = None def __init__(self, data, feature_names=None): - self.data = data + data = check_array(data, ensure_2d=False) + self._data = data self.feature_names = feature_names - @property - def data(self): - return self._data - - @data.setter - def data(self, value): - value = check_array(value) - self._data = value - - if self.feature_names is None: - return - - if len(self.feature_names) != self._col_count(value): - self._feature_names = None - @property def feature_names(self): return self._feature_names @@ -66,8 +50,10 @@ def feature_names(self, value): self._feature_names = None return + if np.isscalar(value): + value = [value] value = column_or_1d(value) - col_count = self._col_count(self.data) + col_count = self._col_count(self._data) if len(value) != col_count: raise ValueError("{} column names provided, but data has {} " "columns".format(len(value), col_count)) @@ -84,14 +70,11 @@ def __getattr__(self, name): return getattr(self._data, name) def __getitem__(self, slice): - return self.data[slice] - - def __array__(self, *args, **kwargs): - return self.data.__array__(*args, **kwargs) + return self._data[slice] def __repr__(self): prefix = self.__class__.__name__ + '(' - base_repr = np.array2string(self.data, + base_repr = np.array2string(self._data, prefix=prefix) return (prefix + base_repr + ',\n feature_names={})'.format( diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py index 6705106572a12..a02aacc693b48 100644 --- a/sklearn/utils/tests/test_namedarray.py +++ b/sklearn/utils/tests/test_namedarray.py @@ -11,6 +11,9 @@ def test_basics(): assert not isinstance(x + 1, NamedArray) assert not isinstance(x + x, NamedArray) assert not isinstance(x + np.ones(shape=(5, 3)), NamedArray) + assert x[0, :].shape == (3,) + assert x[:, 0].shape == (5,) + assert x[0:2].shape == (2, 3) def test_validation(): @@ -20,12 +23,6 @@ def test_validation(): # allow None as feature_names NamedArray(np.ones(shape=(3, 3))) - x = NamedArray(np.ones(shape=(3, 3)), feature_names=[1, 2, 3]) - x.data = np.ones(shape=(4, 3)) - assert x.feature_names is not None - x.data = np.ones(shape=(4, 4)) - assert x.feature_names is None - def test_getattr(): x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c']) @@ -38,3 +35,16 @@ def test_pandas(): _ = pytest.importorskip("pandas") x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c']) assert all(x.todataframe().columns == ['a', 'b', 'c']) + + +def test_1d(): + x = NamedArray(np.array([1, 2]), feature_names=['a']) + assert x.feature_names == ['a'] + + +def test_repr(): + repr_ = ("NamedArray([[1 2]\n" + " [3 4]],\n" + " feature_names=['a' 'b'])") + x = NamedArray([[1, 2], [3, 4]], feature_names=['a', 'b']) + assert repr(x) == repr_ From 5b47f06acdcd07e18a84adbd1c7b2345dea1db4c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Jul 2019 21:13:06 +0200 Subject: [PATCH 10/17] implement __dir__ --- sklearn/utils/_namedarray.py | 3 +++ sklearn/utils/tests/test_namedarray.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index bcad06ecf8991..e6b50a1302294 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -69,6 +69,9 @@ def _col_count(self, value): def __getattr__(self, name): return getattr(self._data, name) + def __dir__(self): + return list(set(dir(NamedArray)).union(set(dir(self._data)))) + def __getitem__(self, slice): return self._data[slice] diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py index a02aacc693b48..b4ac438672d92 100644 --- a/sklearn/utils/tests/test_namedarray.py +++ b/sklearn/utils/tests/test_namedarray.py @@ -48,3 +48,9 @@ def test_repr(): " feature_names=['a' 'b'])") x = NamedArray([[1, 2], [3, 4]], feature_names=['a', 'b']) assert repr(x) == repr_ + + +def test_numpy_attrs(): + a = np.ones(shape=(1)) + x = NamedArray(a, feature_names='a') + assert set(dir(a)) < set(dir(x)) From 42bdf9f7b7939486894d28db00131dede6d33747 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 9 Sep 2019 16:36:27 +0200 Subject: [PATCH 11/17] adding sparse named array --- sklearn/utils/_namedarray.py | 109 ++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 26 deletions(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index e6b50a1302294..c09e0c9fe4224 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -4,42 +4,20 @@ # License: BSD 3 clause import numpy as np +import scipy as sp # NDArrayOperatorsMixin was added in numpy 1.13 # TODO: cleanup once we support numpy 1.13+ try: from numpy.lib.mixins import NDArrayOperatorsMixin except ImportError: - raise NotImplementedError("In order to use NamedAraay, please upgrade your " - "numpy to 1.13+!") + raise NotImplementedError("In order to use NamedAraay, please upgrade your" + " numpy to 1.13+!") from .validation import check_array, column_or_1d -class NamedArray(NDArrayOperatorsMixin): - """A wrapper to a numpy ndarray holding some metadata about the data. - - Instances of this object behave like a numpy array, and loose all metadata - information in numerical operations. - - Parameters - ---------- - data: array-like - A one or two dimensional array like data. - - feature_names: list or array of strings, or None, default=None - Feature names associated with the columns of the data. The number of - columns should always be the same as the number of feature names. - Setting the `data` of an instance, would result in `feature_names` to - be `None` if the number of columns do not match the number of stored - feature names. - """ - - def __init__(self, data, feature_names=None): - data = check_array(data, ensure_2d=False) - self._data = data - self.feature_names = feature_names - +class FeatureNamesMixin: @property def feature_names(self): return self._feature_names @@ -66,6 +44,31 @@ def _col_count(self, value): else: return value.shape[1] + +class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin): + """A wrapper to a numpy ndarray holding some metadata about the data. + + Instances of this object behave like a numpy array, and loose all metadata + information in numerical operations. + + Parameters + ---------- + data: array-like + A one or two dimensional array like data. + + feature_names: list or array of strings, or None, default=None + Feature names associated with the columns of the data. The number of + columns should always be the same as the number of feature names. + Setting the `data` of an instance, would result in `feature_names` to + be `None` if the number of columns do not match the number of stored + feature names. + """ + + def __init__(self, data, feature_names=None): + data = check_array(data, ensure_2d=False) + self._data = data + self.feature_names = feature_names + def __getattr__(self, name): return getattr(self._data, name) @@ -87,3 +90,57 @@ def todataframe(self): """Returns a `pandas.DataFrame` with set column names.""" import pandas as pd return pd.DataFrame(self._data, columns=self.feature_names) + + +class SparseNamedArrayMixin(FeatureNamesMixin): + def __init__(self, *args, feature_names=None, **kwargs): + super().__init__(*args, **kwargs) + self.feature_names = feature_names + + def __repr__(self): + res = super().__repr__() + res += "\nfeature names: %s" % repr(self._feature_names) + return res + + +# We need a class per sparse matrix type, hence the following 7 classes. +class SparseNamedArrayCSR(SparseNamedArrayMixin, sp.sparse.csr_matrix): + pass + + +class SparseNamedArrayCSC(SparseNamedArrayMixin, sp.sparse.csc_matrix): + pass + + +class SparseNamedArrayBSR(SparseNamedArrayMixin, sp.sparse.bsr_matrix): + pass + + +class SparseNamedArrayLIL(SparseNamedArrayMixin, sp.sparse.lil_matrix): + pass + + +class SparseNamedArrayDOK(SparseNamedArrayMixin, sp.sparse.dok_matrix): + pass + + +class SparseNamedArrayDIA(SparseNamedArrayMixin, sp.sparse.dia_matrix): + pass + + +class SparseNamedArrayCOO(SparseNamedArrayMixin, sp.sparse.coo_matrix): + pass + + +def make_namedarray(X, feature_names): + types = {'csr': SparseNamedArrayCSR, + 'csc': SparseNamedArrayCSC, + 'bsr': SparseNamedArrayBSR, + 'lil': SparseNamedArrayLIL, + 'dok': SparseNamedArrayDOK, + 'dia': SparseNamedArrayDIA, + 'coo': SparseNamedArrayCOO} + if sp.sparse.issparse(X): + return types[X.format](X, feature_names=feature_names, copy=False) + else: + return NamedArray(X, feature_names=feature_names) From 293dbba5f02c608ab2824b1444d4fc2d5e5a6a8b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 6 Jan 2020 16:13:51 +0100 Subject: [PATCH 12/17] fix typo and remove merge note from __init__ --- sklearn/utils/__init__.py | 1 - sklearn/utils/_namedarray.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 6a5162edfb972..de0f2bb4a57ea 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -54,7 +54,6 @@ "resample", "shuffle", "check_matplotlib_support", "all_estimators", "NamedArray", "make_namedarray" ] ->>>>>>> upstream/master IS_PYPY = platform.python_implementation() == 'PyPy' _IS_32BIT = 8 * struct.calcsize("P") == 32 diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index c09e0c9fe4224..a8d2f7d1f7c61 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -48,7 +48,7 @@ def _col_count(self, value): class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin): """A wrapper to a numpy ndarray holding some metadata about the data. - Instances of this object behave like a numpy array, and loose all metadata + Instances of this object behave like a numpy array, and lose all metadata information in numerical operations. Parameters From a498e4db15571ae9e25e949578bf7618223fc47c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 6 Jan 2020 16:32:56 +0100 Subject: [PATCH 13/17] understand pandas feature names --- sklearn/utils/_namedarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index a8d2f7d1f7c61..747ad2ab1c1d1 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -65,6 +65,8 @@ class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin): """ def __init__(self, data, feature_names=None): + if hasattr(data, 'columns') and feature_names is None: + feature_names = list(data.columns) data = check_array(data, ensure_2d=False) self._data = data self.feature_names = feature_names From 3d70e72858ad6ace23fc22f831214f56a2d37708 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 6 Jan 2020 16:55:06 +0100 Subject: [PATCH 14/17] fix the sparse case --- sklearn/utils/_namedarray.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index 747ad2ab1c1d1..30b537d4decdf 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -97,6 +97,7 @@ def todataframe(self): class SparseNamedArrayMixin(FeatureNamesMixin): def __init__(self, *args, feature_names=None, **kwargs): super().__init__(*args, **kwargs) + self._data = self self.feature_names = feature_names def __repr__(self): From d122adb063fbb97028d9e117c4f89ddcc33b4d22 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 7 Jan 2020 14:51:54 +0100 Subject: [PATCH 15/17] add force_sparse and sparse from df --- sklearn/utils/_namedarray.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py index 30b537d4decdf..b30e526b47661 100644 --- a/sklearn/utils/_namedarray.py +++ b/sklearn/utils/_namedarray.py @@ -105,6 +105,12 @@ def __repr__(self): res += "\nfeature names: %s" % repr(self._feature_names) return res + def todataframe(self): + """Returns a `pandas.DataFrame` with set column names.""" + import pandas as pd + return pd.DataFrame.sparse.from_spmatrix(self, + columns=self.feature_names) + # We need a class per sparse matrix type, hence the following 7 classes. class SparseNamedArrayCSR(SparseNamedArrayMixin, sp.sparse.csr_matrix): @@ -135,7 +141,7 @@ class SparseNamedArrayCOO(SparseNamedArrayMixin, sp.sparse.coo_matrix): pass -def make_namedarray(X, feature_names): +def make_namedarray(X, feature_names=None, force_sparse=None): types = {'csr': SparseNamedArrayCSR, 'csc': SparseNamedArrayCSC, 'bsr': SparseNamedArrayBSR, @@ -143,7 +149,16 @@ def make_namedarray(X, feature_names): 'dok': SparseNamedArrayDOK, 'dia': SparseNamedArrayDIA, 'coo': SparseNamedArrayCOO} + if hasattr(X, 'columns') and feature_names is None: + feature_names = list(X.columns) + + format = None if sp.sparse.issparse(X): - return types[X.format](X, feature_names=feature_names, copy=False) + format = X.format + elif force_sparse: + format = force_sparse + + if format: + return types[format](X, feature_names=feature_names, copy=False) else: return NamedArray(X, feature_names=feature_names) From 169cbb387606682295a30be28ccc1a4259f667e2 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 7 Jan 2020 14:57:34 +0100 Subject: [PATCH 16/17] add sparse types to init --- sklearn/utils/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index de0f2bb4a57ea..5a84b42e5e69f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -31,6 +31,13 @@ check_consistent_length, check_X_y, indexable, check_symmetric, check_scalar) from ._namedarray import NamedArray, make_namedarray +from ._namedarray import (SparseNamedArrayCSR, + SparseNamedArrayCSC, + SparseNamedArrayBSR, + SparseNamedArrayLIL, + SparseNamedArrayDOK, + SparseNamedArrayDIA, + SparseNamedArrayCOO) from .. import get_config @@ -52,7 +59,10 @@ "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", - "NamedArray", "make_namedarray" + "NamedArray", "make_namedarray", "SparseNamedArrayCSR", + "SparseNamedArrayCSC", "SparseNamedArrayBSR", + "SparseNamedArrayLIL", "SparseNamedArrayDOK", + "SparseNamedArrayDIA", "SparseNamedArrayCOO", ] IS_PYPY = platform.python_implementation() == 'PyPy' From a6fbfbdfa7653f270c72b5536055202069f99615 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 7 Jan 2020 14:59:13 +0100 Subject: [PATCH 17/17] pep8 --- sklearn/utils/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 5a84b42e5e69f..faa038192231a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -62,8 +62,7 @@ "NamedArray", "make_namedarray", "SparseNamedArrayCSR", "SparseNamedArrayCSC", "SparseNamedArrayBSR", "SparseNamedArrayLIL", "SparseNamedArrayDOK", - "SparseNamedArrayDIA", "SparseNamedArrayCOO", - ] + "SparseNamedArrayDIA", "SparseNamedArrayCOO"] IS_PYPY = platform.python_implementation() == 'PyPy' _IS_32BIT = 8 * struct.calcsize("P") == 32