added new transformer for the impute function (blue-yonder#210)

GregorKoehler · MaxBenChrist · commit 16c1e2776e5b · 2017-05-31T17:00:27.000+02:00
* added new transformer for the impute function

* requested changes

* remaining changes

* added per_column_imputer to docs

* structural changes to allow for more flexible presets, additional tests and line endings

* small test name changes
diff --git a/docs/api/tsfresh.transformers.rst b/docs/api/tsfresh.transformers.rst
@@ -29,3 +29,12 @@ relevant_feature_augmenter
     :members:
     :undoc-members:
     :show-inheritance:
+
+per_column_imputer
+--------------------------
+
+.. automodule:: tsfresh.transformers.per_column_imputer
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/tests/transformers/test_per_column_imputer.py b/tests/transformers/test_per_column_imputer.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
+# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
+
+from builtins import range
+from unittest import TestCase
+import pandas as pd
+import pandas.util.testing as pdt
+from sklearn.exceptions import NotFittedError
+
+import numpy as np
+import numpy.testing as npt
+
+from tsfresh.transformers.per_column_imputer import PerColumnImputer
+
+
+class PerColumnImputerTestCase(TestCase):
+    def setUp(self):
+        np.random.seed(0)
+
+    def test_not_fitted(self):
+        imputer = PerColumnImputer()
+
+        X = pd.DataFrame()
+
+        self.assertRaises(NotFittedError, imputer.transform, X)
+
+    def test_only_nans_and_infs(self):
+        imputer = PerColumnImputer()
+
+        X = pd.DataFrame(index=list(range(100)))
+
+        X["NaNs"] = np.nan * np.ones(100)
+        X["PINF"] = np.inf * np.ones(100)
+        X["NINF"] = -np.inf * np.ones(100)
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        self.assertTrue((selected_X.values == 0).all())
+
+    def test_with_numpy_array(self):
+        imputer = PerColumnImputer()
+
+        X = pd.DataFrame(index=list(range(100)))
+
+        X["NaNs"] = np.nan * np.ones(100)
+        X["PINF"] = np.inf * np.ones(100)
+        X["NINF"] = -np.inf * np.ones(100)
+
+        X_numpy = X.values
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        # re-initialize for new dicts
+        imputer = PerColumnImputer()
+        imputer.fit(X_numpy)
+        selected_X_numpy = imputer.transform(X_numpy)
+
+        npt.assert_array_equal(selected_X.values, selected_X_numpy.values)
+        # imputing must keep the input's shape: 100 rows, 3 columns
+        self.assertEqual(selected_X_numpy.shape, (100, 3))
+
+    def test_standard_replacement_behavior(self):
+        imputer = PerColumnImputer()
+
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
+        X = pd.DataFrame({"a": data})
+        true_X = pd.DataFrame({"a": truth})
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_partial_preset_col_to_NINF_given(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
+        X = pd.DataFrame({"a": data})
+        true_X = pd.DataFrame({"a": truth})
+
+        col_to_min = {"a": -100}
+        imputer = PerColumnImputer(col_to_NINF_repl_preset=col_to_min)
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_partial_preset_col_to_PINF_given(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
+        X = pd.DataFrame({"a": data})
+        true_X = pd.DataFrame({"a": truth})
+
+        col_to_max = {"a": 100}
+        imputer = PerColumnImputer(col_to_PINF_repl_preset=col_to_max)
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_partial_preset_col_to_NAN_given(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
+        X = pd.DataFrame({"a": data})
+        true_X = pd.DataFrame({"a": truth})
+
+        col_to_median = {"a": 1}
+        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_different_shapes_fitted_and_transformed(self):
+        imputer = PerColumnImputer()
+
+        X = pd.DataFrame(index=list(range(10)))
+        X["a"] = np.ones(10)
+
+        imputer.fit(X)
+        X["b"] = np.ones(10)
+
+        self.assertRaises(ValueError, imputer.transform, X)
+
+    def test_preset_has_higher_priority_than_fit(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth = [-100.0, 100.0, 0.0, 100.0, -100.0, 1.0, 1.0]
+
+        X = pd.DataFrame({"a": data})
+        true_X = pd.DataFrame({"a": truth})
+
+        col_to_median = {"a": 0}
+        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)
+        imputer.fit(X)
+
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_only_parameters_of_last_fit_count(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        data_2 = [-np.inf, np.inf, np.nan, 10.0, -10.0, 3.0, 3.0]
+        truth_a = [-10.0, 10.0, 3.0, 10.0, -10.0, 3.0, 3.0]
+        truth_b = [-10.0, 10.0, 3.0, 10.0, -10.0, 3.0, 3.0]
+
+        X = pd.DataFrame({"a": data, "b": data})
+        X_2 = pd.DataFrame({"a": data_2, "b": data_2})
+        true_X = pd.DataFrame({"a": truth_a, "b": truth_b})
+
+        imputer = PerColumnImputer()
+
+        imputer.fit(X)
+        imputer.fit(X_2)
+
+        selected_X = imputer.transform(X_2)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_only_subset_of_columns_given(self):
+        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
+        truth_a = [-100.0, 100.0, 0.0, 100.0, -100.0, 1.0, 1.0]
+        truth_b = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
+        X = pd.DataFrame({"a": data, "b": data})
+        true_X = pd.DataFrame({"a": truth_a, "b": truth_b})
+
+        col_to_median = {"a": 0}
+        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)
+
+        imputer.fit(X)
+        selected_X = imputer.transform(X)
+
+        pdt.assert_frame_equal(selected_X, true_X)
+
+    def test_NINF_preset_contains_more_columns_than_dataframe_to_fit(self):
+        X = pd.DataFrame(index=list(range(10)))
+        X["a"] = np.ones(10)
+
+        col_to_min = {"a": 0, "b": 0}
+
+        imputer = PerColumnImputer(col_to_NINF_repl_preset=col_to_min)
+
+        self.assertRaises(ValueError, imputer.fit, X)
+
+    def test_PINF_preset_contains_more_columns_than_dataframe_to_fit(self):
+        X = pd.DataFrame(index=list(range(10)))
+        X["a"] = np.ones(10)
+
+        col_to_max = {"a": 0, "b": 0}
+
+        imputer = PerColumnImputer(col_to_PINF_repl_preset=col_to_max)
+
+        self.assertRaises(ValueError, imputer.fit, X)
+
+    def test_NAN_preset_contains_more_columns_than_dataframe_to_fit(self):
+        X = pd.DataFrame(index=list(range(10)))
+        X["a"] = np.ones(10)
+
+        col_to_median = {"a": 0, "b": 0}
+
+        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)
+
+        self.assertRaises(ValueError, imputer.fit, X)
diff --git a/tsfresh/transformers/__init__.py b/tsfresh/transformers/__init__.py
@@ -6,3 +6,4 @@
 from tsfresh.transformers.feature_augmenter import FeatureAugmenter
 from tsfresh.transformers.feature_selector import FeatureSelector
 from tsfresh.transformers.relevant_feature_augmenter import RelevantFeatureAugmenter
+from tsfresh.transformers.per_column_imputer import PerColumnImputer
diff --git a/tsfresh/transformers/per_column_imputer.py b/tsfresh/transformers/per_column_imputer.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
+# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.exceptions import NotFittedError
+from tsfresh.utilities.dataframe_functions import get_range_values_per_column, impute_dataframe_range
+import pandas as pd
+
+class PerColumnImputer(BaseEstimator, TransformerMixin):
+    """
+    Sklearn-compatible estimator, for column-wise imputing DataFrames by replacing all ``NaNs`` and ``infs``
+    with average/extreme values from the same columns. It is basically a wrapper around
+    :func:`~tsfresh.utilities.dataframe_functions.impute_dataframe_range`.
+
+    Each occurring ``inf`` or ``NaN`` in the DataFrame is replaced by
+
+    * ``-inf`` -> ``min``
+    * ``+inf`` -> ``max``
+    * ``NaN`` -> ``median``
+
+    This estimator - as most of the sklearn estimators - works in a two step procedure. First, the ``.fit``
+    function is called where for each column the min, max and median are computed.
+    Secondly, the ``.transform`` function is called which replaces the occurrences of ``NaNs`` and ``infs`` using
+    the column-wise computed min, max and median values (or the preset values passed to the constructor).
+    """
+    def __init__(self, col_to_NINF_repl_preset=None, col_to_PINF_repl_preset=None, col_to_NAN_repl_preset=None):
+        """
+        Create a new PerColumnImputer instance, optionally with dictionaries containing replacements for
+        ``NaNs`` and ``infs``.
+
+        :param col_to_NINF_repl_preset: Dictionary mapping column names to ``-inf`` replacement values
+        :type col_to_NINF_repl_preset: dict
+        :param col_to_PINF_repl_preset: Dictionary mapping column names to ``+inf`` replacement values
+        :type col_to_PINF_repl_preset: dict
+        :param col_to_NAN_repl_preset: Dictionary mapping column names to ``NaN`` replacement values
+        :type col_to_NAN_repl_preset: dict
+        """
+        self._col_to_NINF_repl = None
+        self._col_to_PINF_repl = None
+        self._col_to_NAN_repl = None
+        self.col_to_NINF_repl_preset = col_to_NINF_repl_preset
+        self.col_to_PINF_repl_preset = col_to_PINF_repl_preset
+        self.col_to_NAN_repl_preset = col_to_NAN_repl_preset
+
+    def fit(self, X, y=None):
+        """
+        Compute the min, max and median for all columns in the DataFrame. For more information,
+        please see the :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function.
+
+        :param X: DataFrame to calculate min, max and median values on
+        :type X: pandas.DataFrame (other inputs are passed to the ``pandas.DataFrame`` constructor first)
+        :param y: Unneeded.
+        :type y: Any
+        :return: the estimator with the computed min, max and median values
+        :rtype: PerColumnImputer
+        :raise ValueError: if a preset dictionary contains keys that are not column names of ``X``
+        """
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+
+        col_to_max, col_to_min, col_to_median = get_range_values_per_column(X)
+        # preset values override the computed statistics; presets for columns unknown to X are rejected
+        if self.col_to_NINF_repl_preset is not None:
+            if not set(X.columns) >= set(self.col_to_NINF_repl_preset):
+                raise ValueError("Preset dictionary 'col_to_NINF_repl_preset' contain more keys "
+                                 "than the column names in X")
+            col_to_min.update(self.col_to_NINF_repl_preset)
+        self._col_to_NINF_repl = col_to_min
+
+        if self.col_to_PINF_repl_preset is not None:
+            if not set(X.columns) >= set(self.col_to_PINF_repl_preset):
+                raise ValueError("Preset dictionary 'col_to_PINF_repl_preset' contain more keys "
+                                 "than the column names in X")
+            col_to_max.update(self.col_to_PINF_repl_preset)
+        self._col_to_PINF_repl = col_to_max
+
+        if self.col_to_NAN_repl_preset is not None:
+            if not set(X.columns) >= set(self.col_to_NAN_repl_preset):
+                raise ValueError("Preset dictionary 'col_to_NAN_repl_preset' contain more keys "
+                                 "than the column names in X")
+            col_to_median.update(self.col_to_NAN_repl_preset)
+        self._col_to_NAN_repl = col_to_median
+
+        return self
+
+    def transform(self, X):
+        """
+        Column-wise replace all ``NaNs``, ``-inf`` and ``+inf`` in the DataFrame `X` with average/extreme
+        values from the provided dictionaries.
+
+        :param X: DataFrame to impute
+        :type X: pandas.DataFrame (other inputs are passed to the ``pandas.DataFrame`` constructor first)
+
+        :return: imputed DataFrame
+        :rtype: pandas.DataFrame
+        :raise NotFittedError: if the replacement dictionaries are still of None type.
+         This can happen if the transformer was not fitted.
+        """
+
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+
+        if self._col_to_NINF_repl is None or self._col_to_PINF_repl is None or self._col_to_NAN_repl is None:
+            raise NotFittedError("PerColumnImputer is not fitted")
+
+        X = impute_dataframe_range(X, self._col_to_PINF_repl, self._col_to_NINF_repl, self._col_to_NAN_repl)
+
+        return X