scikit-learn · marcelobeckmann · Dec 20, 2018 · Dec 20, 2018 · Apr 10, 2019 · Apr 25, 2019
diff --git a/doc/modules/metrics.rst b/doc/modules/metrics.rst
@@ -93,6 +93,46 @@ is equivalent to :func:`linear_kernel`, only slower.)
       Information Retrieval. Cambridge University Press.
       https://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html
 
+.. _gower_distances:
+
+Gower distances
+---------------
+The function :func:`~sklearn.metrics.pairwise.gower_distances` computes the
+distances between the observations in X and Y, which may contain combinations
+of numerical, boolean, or categorical attributes, using an implementation of
+Gower Similarity.
+
+.. math::
+
+    g(\mathbf{x}, \mathbf{y}) = \frac{\sum_k s(x_k, y_k)}{|\{k \mid x_k\text{ and }y_k\text{ are not missing}\}|}
+
+Where:
+
+x, y : two samples to be compared.
+
+s(x_k, y_k) : the dissimilarity between x and y on feature k
+(for k = 1 to n_features), as described by the expressions:
+
+    s(x_k, y_k) = 0, if k represents a boolean or categorical attribute,
+    and they are equal.
+
+    s(x_k, y_k) = 1, if k represents a boolean or categorical attribute,
+    and they are unequal.
+
+    s(x_k, y_k) = abs(x_k - y_k) / R_k, if k represents a numerical
+    attribute, where R_k is the range of values of attribute k.
+
+    s(x_k, y_k) = 0, if x_k or y_k are missing.
+
+
+The Gower formula combines a Manhattan (L1) distance for numeric features
+with Hamming distance for categorical features to obtain a general coefficient
+for categorical and numeric data.
+
+.. topic:: References:
+
+    * Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its
+      Properties, Biometrics, Vol. 27, No. 4. (Dec., 1971), pp. 857-871.
+      http://members.cbio.mines-paristech.fr/~jvert/svn/bibli/local/Gower1971general.pdf
+
 .. _linear_kernel:
 
 Linear kernel

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
@@ -31,7 +31,7 @@
 
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
-
+from ..utils.fixes import _object_dtype_isnan
 
 # Utility Functions
 def _return_float_dtype(X, Y):
@@ -544,7 +544,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
         Valid values for metric are:
 
         - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
-          'manhattan']
+          'manhattan', 'gower']
 
         - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
           'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
@@ -632,7 +632,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean",
         Valid values for metric are:
 
         - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
-          'manhattan']
+          'manhattan', 'gower']
 
         - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
           'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
@@ -829,6 +829,232 @@ def cosine_distances(X, Y=None):
     return S
 
 
+def gower_distances(X, Y=None, categorical_features=None, scale=True):
+    """Compute the Gower distances between the observations in X and Y.
+
+    X and Y may contain mixed types of data (numerical, boolean or
+    categorical attributes).
+
+    Parameters
+    ----------
+    X : array-like, or pandas.DataFrame, shape (n_samples_X, n_features)
+
+    Y : array-like, or pandas.DataFrame, optional,
+        shape (n_samples_Y, n_features)
+
+    categorical_features : array-like, optional, shape (n_features)
+        Indicates with True/False whether a column is a categorical attribute.
+        This is useful when categorical attributes are represented as integer
+        values. Categorical ordinal attributes are treated as numeric, and
+        must be marked as false.
+
+        Alternatively, the categorical_features array can be represented only
+        with the numerical indexes of the categorical attributes.
+
+        If the categorical_features array is not provided, by default all
+        non-numeric columns are considered categorical.
+
+    scale : boolean, list or array, optional (default=True)
+        Indicates if the numerical columns will be scaled between 0 and 1.
+        If false, it is assumed the numerical columns are already scaled.
+        If a list or array, it must contain the ranges of values from
+        numerical columns.
+
+    Returns
+    -------
+    distances : ndarray, shape (n_samples_X, n_samples_Y)
+
+    Raises
+    ------
+    TypeError
+        If X or Y is a sparse matrix, or if ``scale`` is not a boolean, a
+        list or an ndarray.
+
+    ValueError
+        If X is None or empty, if the length of ``scale`` differs from the
+        number of numerical columns, or if ``scale`` is False and the
+        numerical columns are not within [0, 1].
+
+    References
+    ----------
+    Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its
+    Properties.
+
+    Notes
+    -----
+    The numeric feature ranges are determined from both X and Y.
+
+    Current implementation does not support sparse matrices.
+
+    All the non-numerical types (e.g., str), are treated as categorical
+    features.
+
+    This implementation modifies the Gower's original similarity measure in
+    the following aspects:
+
+    * The values in the original similarity S range between 0 and 1. To
+      guarantee this, it is assumed the numerical features of X and Y are
+      scaled between 0 and 1.
+
+    * Different from the original similarity S, this implementation
+      returns 1-S.
+    """
+    if issparse(X) or issparse(Y):
+        raise TypeError("Gower distance does not support sparse matrices")
+
+    if not isinstance(scale, (bool, list, np.ndarray)):
+        raise TypeError("Parameter scale must be boolean, list, or ndarray")
+
+    if X is None or len(X) == 0:
+        raise ValueError("X can not be None or empty")
+
+    # It is necessary to convert to ndarray in advance to define the dtype
+    # as object, otherwise numeric columns will be converted to string
+    # if there are other string columns. The builtin ``object`` is used
+    # because the ``np.object`` alias no longer exists in recent NumPy.
+    if not isinstance(X, np.ndarray):
+        X = np.asarray(X, dtype=object)
+
+    if Y is not None and not isinstance(Y, np.ndarray):
+        Y = np.asarray(Y, dtype=object)
+
+    X, Y = check_pairwise_arrays(X, Y, precomputed=False, dtype=X.dtype,
+                                 force_all_finite=False)
+
+    X = np.asarray(X, dtype=object)
+
+    cat_mask = _detect_categorical_features(X, categorical_features)
+    num_mask = ~ cat_mask
+
+    # Calculates the min and max values, and if requested, scale the
+    # input values in order to obtain the distances between 0 and 1,
+    # as proposed by the Gower's paper.
+    ranges = 1
+    if np.any(num_mask):
+        if isinstance(scale, bool):
+            process_scale = scale
+        else:
+            if len(np.asarray(scale).flatten()) != X[:, num_mask].shape[1]:
+                raise ValueError("Length of scale parameter must be equal "
+                                 "to the number of numerical columns.")
+            process_scale = True
+
+        ranges, col_min, col_max = _precompute_gower_params(
+            X, Y, scale, num_mask)
+
+        # Work on a copy: when ``scale`` is an ndarray supplied by the
+        # caller, the helper hands it back unchanged and the in-place fix
+        # below must not modify the caller's data.
+        ranges = np.array(ranges)
+
+        # avoid division by zero when all values in the column are the same
+        ranges[ranges == 0] = 1
+
+        # check if the data is pre-scaled when scale=False
+        if not process_scale and (np.min(col_min) < 0 or
+                                  np.max(col_max) > 1):
+            raise ValueError("Input data is not scaled between 0 and 1.")
+
+    D = np.zeros((X.shape[0], Y.shape[0]), dtype=float)
+
+    # When X and Y are the same object the result is symmetric, so only the
+    # upper triangle is computed and then mirrored.
+    symmetric = X is Y
+
+    for i in range(X.shape[0]):
+        j_start = i if symmetric else 0
+
+        # Makes the comparison for np.nan for arrays with dtype=object,
+        # this is necessary as some deployments returns True for
+        # np.nan == np.nan
+        cat_nan_cols = (_object_dtype_isnan(X[i, cat_mask]) |
+                        _object_dtype_isnan(Y[j_start:, cat_mask]))
+
+        # Calculates the similarities for categorical columns
+        # NOTE(review): a missing categorical value is counted as a mismatch
+        # here (the ``| cat_nan_cols`` term) and a NaN numerical difference
+        # propagates through np.sum below, whereas the user guide describes
+        # s = 0 for missing values -- confirm the intended behaviour.
+        cat_dists = ((X[i, cat_mask] != Y[j_start:, cat_mask]) | cat_nan_cols)
+        # Calculates the Manhattan distances for numerical columns
+        num_dists = abs(X[i, num_mask] -
+                        Y[j_start:, num_mask]) / ranges
+
+        # Calculates the number of non missing columns
+        non_missing = X.shape[1] - (cat_nan_cols.sum(axis=1) +
+                                    _object_dtype_isnan(num_dists).sum(axis=1)
+                                    .astype(np.float32))
+
+        # This is to avoid ZeroDivisionError
+        non_missing[non_missing == 0] = np.nan
+
+        # Gets the final results
+        total = np.sum(cat_dists, axis=1) + np.sum(num_dists, axis=1)
+
+        results = total / non_missing
+
+        D[i, j_start:] = results
+        if symmetric:
+            D[i:, j_start] = results
+
+    return D
+
+
+def _is_categorical_column(column):
+    """Return True if the first non-missing value of column is non-numeric.
+
+    A column holding only NaN values (or no values at all) is reported as
+    numerical.
+    """
+    for value in column:
+        try:
+            if np.isnan(value):
+                continue
+        except (TypeError, ValueError):
+            # np.isnan can not handle this value (e.g. a str or None),
+            # so the column is not numerical.
+            return True
+        return not np.issubdtype(type(value), np.number)
+    return False
+
+
+def _detect_categorical_features(X, categorical_features=None):
+    """Identifies the numerical and non-numerical (categorical) columns
+    of an array.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+
+    categorical_features : array-like, optional, shape (n_features)
+        Indicates with True/False whether a column is a categorical attribute.
+
+        Alternatively, the categorical_features array can be represented only
+        with the numerical indexes of the categorical attributes. An empty
+        array means that no column is categorical.
+
+        If the categorical_features array is None, they will be automatically
+        detected in X. Numerical columns are identified as a subtype of
+        np.number, whilst categorical columns are not a subtype of np.number.
+        Only the first non-missing value of each column is inspected.
+
+    Returns
+    -------
+    categorical_features_mask : ndarray, shape (n_features)
+
+    """
+    n_features = np.shape(X)[1]
+
+    if categorical_features is not None:
+        categorical_features = np.asarray(categorical_features)
+        # np.asarray([]) has a float dtype, so an empty selection would
+        # otherwise be returned as-is instead of as a boolean mask.
+        if categorical_features.size == 0:
+            return np.zeros(n_features, dtype=bool)
+        if np.issubdtype(categorical_features.dtype, np.integer):
+            # Column indexes were given: turn them into a boolean mask.
+            mask = np.zeros(n_features, dtype=bool)
+            mask[categorical_features] = True
+            return mask
+        return categorical_features
+
+    # Automatic detection of categorical features
+    mask = np.zeros(n_features, dtype=bool)
+    for col in range(n_features):
+        mask[col] = _is_categorical_column(X[:, col])
+    return mask
+
+
+def _precompute_gower_params(X, Y, scale, num_mask):
+    """Precompute data-derived metric parameters for gower distances.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples_X, n_features)
+
+    Y : ndarray, shape (n_samples_Y, n_features), or None
+        Ignored when it is None or the same object as X.
+
+    scale : None, boolean, or array-like
+        If None or a boolean, the ranges are computed from the data.
+        Otherwise it is converted to an ndarray and returned as the ranges.
+
+    num_mask : ndarray of bool, shape (n_features)
+        Selects the numerical columns of X and Y.
+
+    Returns
+    -------
+    scale : ndarray
+        Range of each numerical column (``abs(max - min)``), or the
+        user-supplied ``scale``.
+
+    col_min, col_max : ndarray
+        NaN-ignoring minimum and maximum of each numerical column, taken
+        over X and, when given, Y.
+    """
+    # NOTE(review): the statistics are computed in float32, which may lose
+    # precision for large magnitudes -- confirm this is intended.
+    X_num = X[:, num_mask].astype(np.float32)
+    col_min = np.nanmin(X_num, axis=0)
+    col_max = np.nanmax(X_num, axis=0)
+
+    if Y is not None and Y is not X:
+        Y_num = Y[:, num_mask].astype(np.float32)
+        col_min = np.minimum(np.nanmin(Y_num, axis=0), col_min)
+        col_max = np.maximum(np.nanmax(Y_num, axis=0), col_max)
+
+    if scale is None or isinstance(scale, bool):
+        scale = np.abs(col_max - col_min)
+    else:
+        # Accept any array-like (list, tuple, ndarray); an ndarray is
+        # passed through without copying.
+        scale = np.asarray(scale)
+
+    return scale, col_min, col_max
+
+
 # Paired distances
 def paired_euclidean_distances(X, Y):
     """
@@ -905,7 +1131,7 @@ def paired_cosine_distances(X, Y):
     'l2': paired_euclidean_distances,
     'l1': paired_manhattan_distances,
     'manhattan': paired_manhattan_distances,
-    'cityblock': paired_manhattan_distances}
+    'cityblock': paired_manhattan_distances, }
 
 
 def paired_distances(X, Y, metric="euclidean", **kwds):
@@ -1298,6 +1524,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'l2': euclidean_distances,
     'l1': manhattan_distances,
     'manhattan': manhattan_distances,
+    'gower': gower_distances,
     'precomputed': None,  # HACK: precomputed is always allowed, never called
     'nan_euclidean': nan_euclidean_distances,
 }
@@ -1322,6 +1549,7 @@ def distance_metrics():
     'l1'            metrics.pairwise.manhattan_distances
     'l2'            metrics.pairwise.euclidean_distances
     'manhattan'     metrics.pairwise.manhattan_distances
+    'gower'         metrics.pairwise.gower_distances
     'nan_euclidean' metrics.pairwise.nan_euclidean_distances
     =============== ========================================
 
@@ -1400,7 +1628,7 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds):
                   'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
                   'russellrao', 'seuclidean', 'sokalmichener',
                   'sokalsneath', 'sqeuclidean', 'yule', "wminkowski",
-                  'nan_euclidean', 'haversine']
+                  'nan_euclidean', 'haversine', 'gower']
 
 _NAN_METRICS = ['nan_euclidean']
 
@@ -1429,6 +1657,19 @@ def _check_chunk_size(reduced, chunk_size):
 def _precompute_metric_params(X, Y, metric=None, **kwds):
     """Precompute data-derived metric parameters if not provided
     """
+    if metric == 'gower':
+        categorical_features = None
+        if 'categorical_features' in kwds:
+            categorical_features = kwds['categorical_features']
+
+        num_mask = ~ _detect_categorical_features(X, categorical_features)
+
+        scale = None
+        if 'scale' in kwds:
+            scale = kwds['scale']
+        scale, _, _ = _precompute_gower_params(X, Y, scale, num_mask)
+
+        return {'scale': scale}
     if metric == "seuclidean" and 'V' not in kwds:
         if X is Y:
             V = np.var(X, axis=0, ddof=1)
@@ -1721,6 +1962,15 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
         check_non_negative(X, whom=whom)
         return X
     elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
+        if metric == 'gower':
+            # These conversions are necessary for matrices with string values
+            if not isinstance(X, np.ndarray):
+                X = np.asarray(X, dtype=np.object)
+            if Y is not None and not isinstance(Y, np.ndarray):
+                Y = np.asarray(Y, dtype=np.object)
+            params = _precompute_metric_params(X, Y, metric=metric, **kwds)
+            kwds.update(**params)
+
         func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
     elif callable(metric):
         func = partial(_pairwise_callable, metric=metric,