Skip to content

[MRG] Fused type makedataset #9040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,7 @@ benchmarks/bench_covertype_data/

# Used by py.test
.cache

# files generated from a template
sklearn/utils/seq.dataset.pyx
sklearn/utils/seq.dataset.pxd
14 changes: 11 additions & 3 deletions sklearn/linear_model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from ..utils.extmath import safe_sparse_dot
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from ..utils.fixes import sparse_lsqr
from ..utils.seq_dataset import ArrayDataset32, CSRDataset32
from ..utils.seq_dataset import ArrayDataset, CSRDataset
from ..utils.validation import check_is_fitted
from ..exceptions import NotFittedError
Expand All @@ -56,12 +57,19 @@ def make_dataset(X, y, sample_weight, random_state=None):
# seed should never be 0 in SequentialDataset
seed = rng.randint(1, np.iinfo(np.int32).max)

if X.dtype == np.float32:
CSRData = CSRDataset32
ArrayData = ArrayDataset32
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The codecov chrome/firefox extension tells me that those lines are not covered. Please add a test for that case (and install the codecov browser extension ;).

This test should probably fit a SGDClassifier on 32 bit float iris and check that the coef_ attribute should be 32 bit float as well ( and the output of decision_function should also output a float32 array).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, there isn't any specific test for make_dataset. It's supposed to be covered by the tests for the sag solver. So we can either put the test off to a next PR about the sag solver (#9020) or doing the test you are proposing in this PR. What do you prefer?

else:
CSRData = CSRDataset
ArrayData = ArrayDataset

if sp.issparse(X):
dataset = CSRDataset(X.data, X.indptr, X.indices, y, sample_weight,
seed=seed)
dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight,
seed=seed)
intercept_decay = SPARSE_INTERCEPT_DECAY
else:
dataset = ArrayDataset(X, y, sample_weight, seed=seed)
dataset = ArrayData(X, y, sample_weight, seed=seed)
intercept_decay = 1.0

return dataset, intercept_decay
Expand Down
51 changes: 51 additions & 0 deletions sklearn/linear_model/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@


from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import ignore_warnings
Expand All @@ -18,10 +19,12 @@
from sklearn.linear_model.base import _preprocess_data
from sklearn.linear_model.base import sparse_center_data, center_data
from sklearn.linear_model.base import _rescale_data
from sklearn.linear_model.base import make_dataset
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_greater
from sklearn.datasets.samples_generator import make_sparse_uncorrelated
from sklearn.datasets.samples_generator import make_regression
from sklearn.datasets import load_iris

rng = np.random.RandomState(0)

Expand Down Expand Up @@ -407,3 +410,51 @@ def test_deprecation_center_data():
assert_array_almost_equal(X1_mean, X2_mean)
assert_array_almost_equal(X1_var, X2_var)
assert_array_almost_equal(y1_mean, y2_mean)


def test_fused_types_make_dataset():
iris = load_iris()

X_32 = iris.data.astype(np.float32)
y_32 = iris.target.astype(np.float32)
X_csr_32 = sparse.csr_matrix(X_32)
sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

X_64 = iris.data.astype(np.float64)
y_64 = iris.target.astype(np.float64)
X_csr_64 = sparse.csr_matrix(X_64)
sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

# array
dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
xi_32, yi_32, _, _ = dataset_32._next_py()
xi_64, yi_64, _, _ = dataset_64._next_py()
xi_data_32, _, _ = xi_32
xi_data_64, _, _ = xi_64

assert_equal(xi_data_32.dtype, np.float32)
assert_equal(xi_data_64.dtype, np.float64)
assert_equal(yi_32.dtype, np.float32)
assert_equal(yi_64.dtype, np.float64)
assert_array_almost_equal(yi_64, yi_32, decimal=5)

# csr
datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)
xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py()
xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py()
xicsr_data_32, _, _ = xicsr_32
xicsr_data_64, _, _ = xicsr_64

assert_equal(xicsr_data_32.dtype, np.float32)
assert_equal(xicsr_data_64.dtype, np.float64)
assert_equal(yicsr_32.dtype, np.float32)
assert_equal(yicsr_64.dtype, np.float64)
assert_array_almost_equal(xicsr_data_64, xicsr_data_32, decimal=5)
assert_array_almost_equal(yicsr_64, yicsr_32, decimal=5)

assert_array_equal(xi_data_32, xicsr_data_32)
assert_array_equal(xi_data_64, xicsr_data_64)
assert_array_equal(yi_32, yicsr_32)
assert_array_equal(yi_64, yicsr_64)
51 changes: 0 additions & 51 deletions sklearn/utils/seq_dataset.pxd

This file was deleted.

85 changes: 85 additions & 0 deletions sklearn/utils/seq_dataset.pxd.tp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{{py:

"""
Dataset abstractions for sequential data access.

Template file for easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: seq_dataset.pxd

Each class is duplicated for all dtypes (float and double). The keywords
between double braces are substituted in setup.py.
"""

# name, c_type
dtypes = [('', 'double'),
('32', 'float')]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is much more clear if we use:

 dtypes = [('64', 'double'),
            ('32', 'float')]

I found an error complaining about SequentialDataset and I had a hard time to figure out that it was referring to the 64bits version. @Henley13 do you remember why we choose '' over '64'?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid refactoring. But I don't think it is a good idea


def get_dispatch(dtypes):
for name, c_type in dtypes:
yield name, c_type

}}

{{for name, c_type in get_dispatch(dtypes)}}

#------------------------------------------------------------------------------

"""
Dataset abstractions for sequential data access.
WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
"""

cimport numpy as np

# SequentialDataset and its two concrete subclasses are (optionally randomized)
# iterators over the rows of a matrix X and corresponding target values y.


cdef class SequentialDataset{{name}}:
cdef int current_index
cdef np.ndarray index
cdef int *index_data_ptr
cdef Py_ssize_t n_samples
cdef np.uint32_t seed

cdef void shuffle(self, np.uint32_t seed) nogil
cdef int _get_next_index(self) nogil
cdef int _get_random_index(self) nogil

cdef void _sample(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight,
int current_index) nogil
cdef void next(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) nogil
cdef int random(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) nogil


cdef class ArrayDataset{{name}}(SequentialDataset{{name}}):
cdef np.ndarray X
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef Py_ssize_t n_features
cdef np.npy_intp X_stride
cdef {{c_type}} *X_data_ptr
cdef {{c_type}} *Y_data_ptr
cdef np.ndarray feature_indices
cdef int *feature_indices_ptr
cdef {{c_type}} *sample_weight_data


cdef class CSRDataset{{name}}(SequentialDataset{{name}}):
cdef np.ndarray X_data
cdef np.ndarray X_indptr
cdef np.ndarray X_indices
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef {{c_type}} *X_data_ptr
cdef int *X_indptr_ptr
cdef int *X_indices_ptr
cdef {{c_type}} *Y_data_ptr
cdef {{c_type}} *sample_weight_data

{{endfor}}
Loading