Skip to content

[MRG] Remove _check_1d_array and _is_1d private function #2002

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
177 changes: 57 additions & 120 deletions sklearn/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,102 +34,60 @@
###############################################################################
# General utilities
###############################################################################
def _is_1d(x):
"""Return True if x is 1d or a column vector
def _column_or_1d(y):
""" Ravel column or 1d numpy array, else raises an error

Parameters
----------
x : numpy array.
y : array-like

Returns
-------
is_1d : boolean,
Return True if x can be considered as a 1d vector.

Examples
--------
>>> import numpy as np
>>> from sklearn.metrics.metrics import _is_1d
>>> _is_1d([1, 2, 3])
True
>>> _is_1d(np.array([1, 2, 3]))
True
>>> _is_1d([[1, 2, 3]])
False
>>> _is_1d(np.array([[1, 2, 3]]))
False
>>> _is_1d([[1], [2], [3]])
True
>>> _is_1d(np.array([[1], [2], [3]]))
True
>>> _is_1d([[1, 2], [3, 4]])
False
>>> _is_1d(np.array([[1, 2], [3, 4]]))
False

See also
--------
_check_1d_array
y : array

"""
shape = np.shape(x)
return len(shape) == 1 or len(shape) == 2 and shape[1] == 1

shape = np.shape(y)
if len(shape) == 1 or (len(shape) == 2 and shape[1] == 1):
return np.ravel(y)
raise ValueError("bad input shape {0}".format(shape))

def _check_1d_array(y1, y2, ravel=False):
"""Check that y1 and y2 are vectors of the same shape.

It convert 1d arrays (y1 and y2) of various shape to a common shape
representation. Note that ``y1`` and ``y2`` should have the same number of
elements.
def _check_reg_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same regression task.

    Parameters
    ----------
    y_true : array-like

    y_pred : array-like

    Returns
    -------
    type_true : one of {'continuous', 'continuous-multioutput'}
        The type of the true target data, as output by
        ``utils.multiclass.type_of_target``

    y_true : array-like of shape = [n_samples, n_outputs]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples, n_outputs]
        Estimated target values.
    """
    y_true, y_pred = check_arrays(y_true, y_pred)

    # Promote 1d vectors to column vectors so that both targets share
    # the common (n_samples, n_outputs) convention below.
    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((-1, 1))

    if y_true.shape[1] != y_pred.shape[1]:
        # Bug fix: the message previously formatted y_true.shape[1] for
        # both placeholders, so it always read "n!=n" and hid the actual
        # mismatching y_pred dimension.
        raise ValueError("y_true and y_pred have different number of output "
                         "({0}!={1})".format(y_true.shape[1],
                                             y_pred.shape[1]))

    y_type = 'continuous' if y_true.shape[1] == 1 else 'continuous-multioutput'

    return y_type, y_true, y_pred


def _check_clf_targets(y_true, y_pred):
Expand Down Expand Up @@ -181,8 +139,8 @@ def _check_clf_targets(y_true, y_pred):
# 'binary' can be removed
type_true = type_pred = 'multiclass'

y_true = np.ravel(y_true)
y_pred = np.ravel(y_pred)
y_true = _column_or_1d(y_true)
y_pred = _column_or_1d(y_pred)

else:
raise ValueError("Can't handle %s/%s targets" % (type_true, type_pred))
Expand Down Expand Up @@ -466,8 +424,10 @@ def matthews_corrcoef(y_true, y_pred):
-0.33...

"""
y_true, y_pred = check_arrays(y_true, y_pred)
y_true, y_pred = _check_1d_array(y_true, y_pred, ravel=True)
y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)

if y_type != "binary":
raise ValueError("%s is not supported" % y_type)

mcc = np.corrcoef(y_true, y_pred)[0, 1]
if np.isnan(mcc):
Expand Down Expand Up @@ -508,7 +468,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None):
Decreasing score values.
"""
y_true, y_score = check_arrays(y_true, y_score)
y_true, y_score = _check_1d_array(y_true, y_score, ravel=True)
y_true = _column_or_1d(y_true)
y_score = _column_or_1d(y_score)

# ensure binary classification if pos_label is not specified
classes = np.unique(y_true)
Expand Down Expand Up @@ -744,8 +705,9 @@ def confusion_matrix(y_true, y_pred, labels=None):
[1, 0, 2]])

"""
y_true, y_pred = check_arrays(y_true, y_pred)
y_true, y_pred = _check_1d_array(y_true, y_pred, ravel=True)
y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
if y_type not in ("binary", "multiclass"):
raise ValueError("%s is not supported" % y_type)

if labels is None:
labels = unique_labels(y_true, y_pred)
Expand Down Expand Up @@ -833,20 +795,13 @@ def zero_one_loss(y_true, y_pred, normalize=True):


"""
y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
score = accuracy_score(y_true, y_pred,
normalize=normalize)

if normalize:
return 1 - score
else:
if hasattr(y_true, "shape"):
n_samples = (np.max(y_true.shape) if _is_1d(y_true)
else y_true.shape[0])

else:
n_samples = len(y_true)

n_samples = len(y_true)
return n_samples - score


Expand Down Expand Up @@ -1074,6 +1029,7 @@ def accuracy_score(y_true, y_pred, normalize=True):
0.0

"""

# Compute accuracy for each possible representation
y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
if y_type == 'multilabel-indicator':
Expand Down Expand Up @@ -1422,9 +1378,9 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None):
else:
labels = np.asarray(labels)
n_labels = labels.size
true_pos = np.zeros((n_labels), dtype=np.int)
false_pos = np.zeros((n_labels), dtype=np.int)
false_neg = np.zeros((n_labels), dtype=np.int)
true_pos = np.zeros((n_labels,), dtype=np.int)
false_pos = np.zeros((n_labels,), dtype=np.int)
false_neg = np.zeros((n_labels,), dtype=np.int)

if y_type == 'multilabel-indicator':
true_pos = np.sum(np.logical_and(y_true == 1,
Expand Down Expand Up @@ -1454,12 +1410,7 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None):
false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i)

# Compute the true_neg using the tp, fp and fn
if hasattr(y_true, "shape"):
n_samples = (np.max(y_true.shape) if _is_1d(y_true)
else y_true.shape[0])
else:
n_samples = len(y_true)

n_samples = len(y_true)
true_neg = n_samples - true_pos - false_pos - false_neg

return true_pos, true_neg, false_pos, false_neg
Expand Down Expand Up @@ -2192,6 +2143,7 @@ def hamming_loss(y_true, y_pred, classes=None):

"""
y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)

if classes is None:
classes = unique_labels(y_true, y_pred)
else:
Expand All @@ -2200,10 +2152,10 @@ def hamming_loss(y_true, y_pred, classes=None):
if y_type == 'multilabel-indicator':
return np.mean(y_true != y_pred)
elif y_type == 'multilabel-sequences':
loss = np.array([len(set(pred) ^ set(true))
for pred, true in zip(y_pred, y_true)])
loss = np.array([len(set(pred) ^ set(true))
for pred, true in zip(y_pred, y_true)])

return np.mean(loss) / np.size(classes)
return np.mean(loss) / np.size(classes)

else:
return sp_hamming(y_true, y_pred)
Expand Down Expand Up @@ -2241,12 +2193,7 @@ def mean_absolute_error(y_true, y_pred):
0.75

"""
y_true, y_pred = check_arrays(y_true, y_pred)

# Handle mix 1d representation
if _is_1d(y_true):
y_true, y_pred = _check_1d_array(y_true, y_pred)

y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
return np.mean(np.abs(y_pred - y_true))


Expand Down Expand Up @@ -2279,12 +2226,7 @@ def mean_squared_error(y_true, y_pred):
0.708...

"""
y_true, y_pred = check_arrays(y_true, y_pred)

# Handle mix 1d representation
if _is_1d(y_true):
y_true, y_pred = _check_1d_array(y_true, y_pred)

y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
return np.mean((y_pred - y_true) ** 2)


Expand Down Expand Up @@ -2322,11 +2264,10 @@ def explained_variance_score(y_true, y_pred):
0.957...

"""
y_true, y_pred = check_arrays(y_true, y_pred)
y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)

# Handle mix 1d representation
if _is_1d(y_true):
y_true, y_pred = _check_1d_array(y_true, y_pred)
if y_type != "continuous":
raise ValueError("{0} is not supported".format(y_type))

numerator = np.var(y_true - y_pred)
denominator = np.var(y_true)
Expand Down Expand Up @@ -2383,11 +2324,7 @@ def r2_score(y_true, y_pred):
0.938...

"""
y_true, y_pred = check_arrays(y_true, y_pred)

# Handle mix 1d representation
if _is_1d(y_true):
y_true, y_pred = _check_1d_array(y_true, y_pred, ravel=True)
y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)

if len(y_true) == 1:
raise ValueError("r2_score can only be computed given more than one"
Expand Down
Loading