[MRG+1] UnaryEncoder to encode ordinal features into unary levels #8652
Changes from all commits
38537ed
a850082
661951c
eb8bc94
88d5eb4
cd21cbf
81af018
f4ba310
0706c29
b642a7e
367dba4
9f3205d
c23ec8d
9d4753a
a43dfb5
@@ -589,6 +589,66 @@ columns for this feature will be all zeros
See :ref:`dict_feature_extraction` for categorical features that are represented
as a dict, not as scalars.

.. _preprocessing_ordinal_features:

Encoding ordinal features
=========================
Often categorical features have a clear ordering. For example, a person could
have features

* ``["short", "tall"]``
* ``["low income", "medium income", "high income"]``
* ``["elementary school graduate", "high school graduate", "some college",
  "college graduate"]``

Even though these features can be ordered, we shouldn't necessarily assign
scores to them, as the difference between categories one and two is not the
same as the difference between categories two and three.

One possibility to convert these ordinal features to features that can be used
with scikit-learn estimators is to use a unary encoding, which is
implemented in :class:`UnaryEncoder`. This estimator transforms each
ordinal feature with ``m`` possible values into ``m - 1`` binary features,
where the i-th feature is active if x > i (for i = 0, ..., m - 2).
.. note::

    This encoding is likely to help when used with linear models and
    kernel-based models like SVMs with the standard kernels. On the other
    hand, this transformation is unlikely to help when used with tree-based
    models, since those already work on the basis of a particular feature
    value being below or above a threshold, unlike linear and kernel-based
    models.
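The encoding rule described above can be sketched in plain NumPy, independently of the proposed ``UnaryEncoder`` class (the helper name ``unary_encode_column`` is hypothetical, chosen for illustration):

```python
import numpy as np

def unary_encode_column(x, n_values):
    # One ordinal column: a value v in [0, n_values) maps to
    # n_values - 1 binary features, where feature i is 1 iff v > i
    # (for i = 0, ..., n_values - 2).
    x = np.asarray(x)
    return (x[:, None] > np.arange(n_values - 1)).astype(float)

print(unary_encode_column([0, 1, 3], 4))
# [[0. 0. 0.]
#  [1. 0. 0.]
#  [1. 1. 1.]]
```

Note how, unlike one-hot encoding, a larger value sets all lower-indexed features as well, which preserves the ordering.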
Continuing the example above::

Review comment: It is not directly clear for me this refers to the list of example features (I was first searching for the last code example).

    >>> enc = preprocessing.UnaryEncoder()
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
           n_values='auto', ordinal_features='all', sparse=False)
    >>> enc.transform([[0, 1, 1]])
    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])

By default, how many values each feature can take is inferred automatically
from the dataset. It is possible to specify this explicitly using the parameter
``n_values``.

Review comment: Can you specify this 'automatically' is done by looking at the maximum?

* There are two height categories, three income levels and four education
  levels in our dataset.
* Then we fit the estimator, and transform a data point.
* In the result, the first number encodes the height, the next two numbers the
  income level, and the next set of three numbers the education level.

Review comment: Need a blank line above the first bullet (to get the list to render well).
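The multi-feature layout from the doctest above can be reproduced with a small NumPy sketch (the helper ``unary_encode`` is a hypothetical stand-in for the proposed estimator, shown only to trace the output columns):

```python
import numpy as np

def unary_encode(X, n_values):
    # Column j of X contributes n_values[j] - 1 binary features;
    # feature i of that group is active iff X[:, j] > i.
    X = np.asarray(X)
    groups = [(X[:, j, None] > np.arange(n - 1)).astype(float)
              for j, n in enumerate(n_values)]
    return np.hstack(groups)

print(unary_encode([[0, 1, 1]], [2, 3, 4]))
# [[0. 1. 0. 1. 0. 0.]]
```

With ``n_values = [2, 3, 4]``, the output has 1 + 2 + 3 = 6 columns, matching ``enc.transform([[0, 1, 1]])`` in the doctest.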
Note that, if there is a possibility that the training data might have missing
categorical values, one has to explicitly set ``n_values``. For example::

    >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4])
    >>> # Note that there are missing categorical values for the 2nd and 3rd
    >>> # features
    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
           n_values=[2, 3, 4], ordinal_features='all', sparse=False)
    >>> enc.transform([[1, 1, 2]])
    array([[ 1.,  1.,  0.,  1.,  1.,  0.]])

.. _imputation:

Imputation of missing values
@@ -62,6 +62,7 @@
    'minmax_scale',
    'quantile_transform',
    'power_transform',
    'UnaryEncoder'
]
@@ -1957,6 +1958,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
        matrix indicating the presence of a class label.
    sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
        and n_classes-1.
    sklearn.preprocessing.UnaryEncoder : encodes ordinal integer features
        using a unary scheme.
    """
    def __init__(self, n_values="auto", categorical_features="all",
                 dtype=np.float64, sparse=True, handle_unknown='error'):
@@ -2064,8 +2067,8 @@ def _transform(self, X):
        mask = (X < self.n_values_).ravel()
        if np.any(~mask):
            if self.handle_unknown not in ['error', 'ignore']:
-               raise ValueError("handle_unknown should be either error or "
-                                "unknown got %s" % self.handle_unknown)
+               raise ValueError("handle_unknown should be either 'error' or "
+                                "'ignore' got %s" % self.handle_unknown)
            if self.handle_unknown == 'error':
                raise ValueError("unknown categorical feature present %s "
                                 "during transform." % X.ravel()[~mask])
@@ -3147,3 +3150,216 @@ def inverse_transform(self, X):
        X_tr[mask, idx] = None

        return X_tr


class UnaryEncoder(BaseEstimator, TransformerMixin):
    """Encode ordinal integer features using a unary scheme.

    The input to this transformer should be a matrix of non-negative integers,
    denoting the values taken on by ordinal (discrete) features. The output
    will be a matrix where each column corresponds to one possible value of
    one feature. It is assumed that input features take on values in the range
    0 to (n_values - 1).

    This encoding is needed for feeding ordinal features to many scikit-learn
    estimators, notably linear models and kernel-based models like SVMs with
    the standard kernels.
    This transformation is unlikely to help when used with tree-based models,
    since those already work on the basis of a particular feature value being
    below or above a threshold, unlike linear and kernel-based models.

    Read more in the :ref:`User Guide <preprocessing_ordinal_features>`.

    Parameters
    ----------
    n_values : 'auto', int or array of ints
        Number of values per feature.

        - 'auto' : determine value range from training data.
        - int : number of ordinal values per feature.
          Each feature value should be in ``range(n_values)``
        - array : ``n_values[i]`` is the number of ordinal values in
          ``X[:, i]``. Each feature value should be
          in ``range(n_values[i])``

    ordinal_features : "all" or array of indices or mask
        Specify what features are treated as ordinal.

        - 'all' (default): All features are treated as ordinal.
        - array of indices: Array of ordinal feature indices.
        - mask: Array of length n_features and with dtype=bool.

        Non-ordinal features are always stacked to the right of the matrix.

    dtype : number type, default=np.float
        Desired dtype of output.

    sparse : boolean, default=False
        Will return sparse matrix if set True else will return an array.

    handle_greater : str, 'warn' or 'error' or 'clip'
        Whether to raise an error or clip or warn if an
        ordinal feature >= n_values is passed in.

        - 'warn' (default): same as clip but with warning.
        - 'error': raise error if feature >= n_values is passed in.
        - 'clip': all the feature values >= n_values are clipped to
          (n_values-1) during transform.

Review comment: I see the discussion about this, @jnotham saying: [...] I understand that reasoning for count-like values. But the examples in the documentation are not count-like, and for those this makes less sense I think as a default.
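The ``handle_greater='clip'`` semantics described above can be sketched in a few lines of NumPy (the helper name ``unary_encode_clipped`` is hypothetical, for illustration only):

```python
import numpy as np

def unary_encode_clipped(x, n_values):
    # handle_greater='clip' behaviour: values >= n_values are first
    # clipped to n_values - 1, then unary-encoded as usual.
    x = np.minimum(np.asarray(x), n_values - 1)
    return (x[:, None] > np.arange(n_values - 1)).astype(float)

# 5 exceeds n_values=4, so it is treated as 3 -> all three features active
print(unary_encode_clipped([5, 2], 4))
# [[1. 1. 1.]
#  [1. 1. 0.]]
```

Under ``'warn'`` the same clipping happens but a warning is emitted first; under ``'error'`` a ``ValueError`` is raised instead.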
    Attributes
    ----------
    feature_indices_ : array of shape (n_features,)
        Indices to feature ranges.
        Feature ``i`` in the original data is mapped to features
        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``

    n_values_ : array of shape (n_features,)
        Maximum number of values per feature.

    Examples
    --------
    Given a dataset with three features and four samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    unary encoding.

    >>> from sklearn.preprocessing import UnaryEncoder
    >>> enc = UnaryEncoder()
    >>> enc.fit([[0, 0, 3],
    ...          [1, 1, 0],
    ...          [0, 2, 1],
    ...          [1, 0, 2]])  # doctest: +ELLIPSIS
    UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn',
           n_values='auto', ordinal_features='all', sparse=False)
    >>> enc.n_values_
    array([2, 3, 4])
    >>> enc.feature_indices_
    array([0, 1, 3, 6])
    >>> enc.transform([[0, 1, 2]])
    array([[ 0.,  1.,  0.,  1.,  1.,  0.]])

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : encodes categorical integer features
        using a one-hot aka one-of-K scheme.

Review comment: categorical -> ordinal ?

Review comment: Can you add here a see also to CategoricalEncoder as well?

    """
    def __init__(self, n_values="auto", ordinal_features="all",
                 dtype=np.float64, sparse=False, handle_greater='warn'):
        self.n_values = n_values
        self.ordinal_features = ordinal_features
        self.dtype = dtype
        self.sparse = sparse
        self.handle_greater = handle_greater
    def fit(self, X, y=None):
        """Fit UnaryEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Input array of type int.
            All feature values should be non-negative otherwise will raise a
            ValueError.
        """
        _transform_selected(X, self._fit, self.ordinal_features, copy=True)
        return self

    def _fit(self, X):
        """Assumes X contains only ordinal features."""
        X = check_array(X, dtype=np.int)
        if self.handle_greater not in ['warn', 'error', 'clip']:
            raise ValueError("handle_greater should be either 'warn', 'error' "
                             "or 'clip' got %s" % self.handle_greater)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        if (isinstance(self.n_values, six.string_types) and
                self.n_values == 'auto'):
            n_values = np.max(X, axis=0) + 1
        elif isinstance(self.n_values, numbers.Integral):
            n_values = np.empty(n_features, dtype=np.int)
            n_values.fill(self.n_values)
        else:
            try:
                n_values = np.asarray(self.n_values, dtype=int)
            except (ValueError, TypeError):
                raise TypeError("Wrong type for parameter `n_values`. Expected"
                                " 'auto', int or array of ints, got %r"
                                % self.n_values)
            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

Review comment: this if statement is never reached in current tests.
Reply: Added a new test to cover it.

        self.n_values_ = n_values
        n_values = np.hstack([[0], n_values - 1])
        indices = np.cumsum(n_values)
        self.feature_indices_ = indices

        mask = (X >= self.n_values_).ravel()
        if np.any(mask):
            if self.handle_greater == 'error':
                raise ValueError("handle_greater='error' but found %d feature"
                                 " values which exceeds n_values."
                                 % np.count_nonzero(mask))

        return X
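The ``feature_indices_`` bookkeeping computed at the end of ``_fit`` can be traced by hand; a minimal sketch, using the ``n_values_`` from the class docstring example:

```python
import numpy as np

# Each original feature i expands to n_values_[i] - 1 binary columns,
# occupying output columns feature_indices_[i] to feature_indices_[i+1]
# (exclusive).
n_values_ = np.array([2, 3, 4])
indices = np.cumsum(np.hstack([[0], n_values_ - 1]))
print(indices)  # [0 1 3 6]
```

This matches the ``enc.feature_indices_`` value ``array([0, 1, 3, 6])`` shown in the docstring example.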
    def _transform(self, X):
        """Assumes X contains only ordinal features."""
        X = check_array(X, dtype=np.int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        indices = self.feature_indices_
        if n_features != indices.shape[0] - 1:
            raise ValueError("X has different shape than during fitting."
                             " Expected %d, got %d."
                             % (indices.shape[0] - 1, n_features))

        # We clip those ordinal features of X that are greater than n_values_
        # using mask if self.handle_greater is "clip".
        # This means, the row_indices and col_indices corresponding to the
        # greater ordinal feature are all filled with ones.
        mask = (X >= self.n_values_).ravel()
        if np.any(mask):
            if self.handle_greater == 'warn':
                warnings.warn("Found %d feature values which exceeds "
                              "n_values during transform, clipping them."
                              % np.count_nonzero(mask))
            elif self.handle_greater == 'error':
                raise ValueError("handle_greater='error' but found %d feature"
                                 " values which exceeds n_values during "
                                 "transform." % np.count_nonzero(mask))

        X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X)
        column_start = np.tile(indices[:-1], n_samples)
        column_end = (indices[:-1] + X_ceil).ravel()
        column_indices = np.hstack([np.arange(s, e) for s, e
                                    in zip(column_start, column_end)])
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                X_ceil.sum(axis=1))
        data = np.ones(X_ceil.ravel().sum())
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()

Review comment: Maybe not too important, but it is also possible to directly create a csr matrix instead of first constructing coo and then converting (for CategoricalEncoder I edited the construction of the indices to do this: 85cf315).

        return out if self.sparse else out.toarray()
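The COO construction in ``_transform`` can be traced standalone for a single sample; a sketch assuming the fitted state from the docstring example (``n_values_ = [2, 3, 4]``, so ``feature_indices_ = [0, 1, 3, 6]``) and no clipping needed:

```python
import numpy as np
from scipy import sparse

X = np.array([[0, 1, 2]])
indices = np.array([0, 1, 3, 6])          # feature_indices_
n_samples = X.shape[0]

column_start = np.tile(indices[:-1], n_samples)   # [0, 1, 3]
column_end = (indices[:-1] + X).ravel()           # [0, 2, 5]
# Value v in feature group j turns on the first v columns of that group.
column_indices = np.hstack([np.arange(s, e)
                            for s, e in zip(column_start, column_end)])
row_indices = np.repeat(np.arange(n_samples), X.sum(axis=1))
data = np.ones(X.ravel().sum())
out = sparse.coo_matrix((data, (row_indices, column_indices)),
                        shape=(n_samples, indices[-1])).tocsr()
print(out.toarray())
# [[0. 1. 0. 1. 1. 0.]]
```

This reproduces ``enc.transform([[0, 1, 2]])`` from the class docstring: feature 0 contributes no active column (value 0), feature 1 activates one of its two columns, and feature 2 activates two of its three columns.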
    def transform(self, X):
        """Transform X using Ordinal encoding.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Input array of type int.
            All feature values should be non-negative otherwise will raise a
            ValueError.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
            Transformed input.
        """
        return _transform_selected(X, self._transform,
                                   self.ordinal_features, copy=True)

Review comment: There is one space too much here at the beginning of the line (to align it with the previous line, you can always check the built docs in https://15748-843222-gh.circle-artifacts.com/0/home/ubuntu/scikit-learn/doc/_build/html/stable/_changed.html, see explanation in http://scikit-learn.org/stable/developers/contributing.html#documentation).