Skip to content

[MRG + 1] ENH: preprocess: adding a max-normalization option #4695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 11, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions sklearn/preprocessing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
from ..utils.fixes import isclose
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2)
from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)
from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis,
min_max_axis)
from ..utils.validation import check_is_fitted

zip = six.moves.zip
Expand Down Expand Up @@ -570,7 +571,7 @@ def normalize(X, norm='l2', axis=1, copy=True):
scipy.sparse matrices should be in CSR format to avoid an
un-necessary copy.

norm : 'l1' or 'l2', optional ('l2' by default)
norm : 'l1', 'l2', or 'max', optional ('l2' by default)
The norm to use to normalize each non zero sample (or each non-zero
feature if axis is 0).

Expand All @@ -589,7 +590,7 @@ def normalize(X, norm='l2', axis=1, copy=True):
using the ``Transformer`` API (e.g. as part of a preprocessing
:class:`sklearn.pipeline.Pipeline`)
"""
if norm not in ('l1', 'l2'):
if norm not in ('l1', 'l2', 'max'):
raise ValueError("'%s' is not a supported norm" % norm)

if axis == 0:
Expand All @@ -609,13 +610,19 @@ def normalize(X, norm='l2', axis=1, copy=True):
inplace_csr_row_normalize_l1(X)
elif norm == 'l2':
inplace_csr_row_normalize_l2(X)
elif norm == 'max':
_, norms = min_max_axis(X, 1)
norms = norms.repeat(np.diff(X.indptr))
mask = norms != 0
X.data[mask] /= norms[mask]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we should raise a ValueError "else". If you could add that and add a test, it would be much appreciated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind, missed the check above.

else:
if norm == 'l1':
norms = np.abs(X).sum(axis=1)
norms[norms == 0.0] = 1.0
elif norm == 'l2':
norms = row_norms(X)
norms[norms == 0.0] = 1.0
elif norm == 'max':
norms = np.max(X, axis=1)
norms[norms == 0.0] = 1.0
X /= norms[:, np.newaxis]

if axis == 0:
Expand Down Expand Up @@ -643,7 +650,7 @@ class Normalizer(BaseEstimator, TransformerMixin):

Parameters
----------
norm : 'l1' or 'l2', optional ('l2' by default)
norm : 'l1', 'l2', or 'max', optional ('l2' by default)
The norm to use to normalize each non zero sample.

copy : boolean, optional, default True
Expand Down
49 changes: 49 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,55 @@ def test_normalizer_l2():
assert_almost_equal(la.norm(X_norm[3]), 0.0)


def test_normalizer_max():
    """Check Normalizer(norm='max') on dense and sparse inputs.

    Rows 0-2 must end up with a row-wise max of exactly 1.0; row 3 is
    zeroed out beforehand and must stay all-zero (no division by zero).
    """
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sparse.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sparse.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='max', copy=True)
        X_norm1 = normalizer.transform(X)
        assert_true(X_norm1 is not X)
        X_norm1 = toarray(X_norm1)

        # copy=False must return the very same object (no-copy optimization)
        normalizer = Normalizer(norm='max', copy=False)
        X_norm2 = normalizer.transform(X)
        assert_true(X_norm2 is X)
        X_norm2 = toarray(X_norm2)

        for X_norm in (X_norm1, X_norm2):
            row_maxs = X_norm.max(axis=1)
            for i in range(3):
                assert_almost_equal(row_maxs[i], 1.0)
            assert_almost_equal(row_maxs[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
        X = init(X_dense)
        # was norm='l2' with a spurious chained assignment
        # (X_norm = normalizer = ...); this test must exercise norm='max'
        X_norm = Normalizer(norm='max', copy=False).transform(X)

        # non-CSR input is converted, so copy=False still yields a new object
        assert_true(X_norm is not X)
        assert_true(isinstance(X_norm, sparse.csr_matrix))

        X_norm = toarray(X_norm)
        # recompute row maxima for THIS result; the previous loop's
        # row_maxs would make these assertions vacuous
        row_maxs = X_norm.max(axis=1)
        for i in range(3):
            assert_almost_equal(row_maxs[i], 1.0)
        assert_almost_equal(row_maxs[3], 0.0)


def test_normalize():
# Test normalize function
# Only tests functionality not used by the tests for Normalizer.
Expand Down