Skip to content

Commit 16c1e27

Browse files
GregorKoehler authored and MaxBenChrist committed
added new transformer for the impute function (blue-yonder#210)
* added new transformer for the impute function * requested changes * remaining changes * added per_column_imputer to docs * structural changes to allow for more flexible presets, additional tests and line endings * small test name changes
1 parent dd8ec7d commit 16c1e27

File tree

4 files changed

+327
-0
lines changed

4 files changed

+327
-0
lines changed

docs/api/tsfresh.transformers.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,12 @@ relevant_feature_augmenter
2929
:members:
3030
:undoc-members:
3131
:show-inheritance:
32+
33+
per_column_imputer
34+
--------------------------
35+
36+
.. automodule:: tsfresh.transformers.per_column_imputer
37+
:members:
38+
:undoc-members:
39+
:show-inheritance:
40+
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
# -*- coding: utf-8 -*-
2+
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
3+
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
4+
5+
from builtins import range
6+
from unittest import TestCase
7+
import pandas as pd
8+
import pandas.util.testing as pdt
9+
from sklearn.exceptions import NotFittedError
10+
11+
import numpy as np
12+
import numpy.testing as npt
13+
14+
from tsfresh.transformers.per_column_imputer import PerColumnImputer
15+
16+
17+
class PerColumnImputerTestCase(TestCase):
    # Unit tests for tsfresh.transformers.per_column_imputer.PerColumnImputer.
    #
    # Most tests use the same seven-value column
    #     [-inf, +inf, NaN, 100.0, -100.0, 1.0, 1.0]
    # whose finite values have min -100, max 100 and median 1.
    # ``np.inf`` / ``-np.inf`` are used instead of ``np.PINF`` / ``np.NINF`` because
    # the latter aliases were removed in NumPy 2.0; the values are identical.

    def setUp(self):
        # Seed numpy's global random number generator before every test.
        np.random.seed(0)

    def test_not_fitted(self):
        # Calling transform before fit must raise sklearn's NotFittedError.
        imputer = PerColumnImputer()

        X = pd.DataFrame()

        self.assertRaises(NotFittedError, imputer.transform, X)

    def test_only_nans_and_infs(self):
        # Columns without a single finite value are expected to be imputed to all zeros.
        imputer = PerColumnImputer()

        X = pd.DataFrame(index=list(range(100)))

        X["NaNs"] = np.nan * np.ones(100)
        X["PINF"] = np.inf * np.ones(100)
        X["NINF"] = -np.inf * np.ones(100)

        imputer.fit(X)
        selected_X = imputer.transform(X)

        self.assertTrue((selected_X.values == 0).all())

    def test_with_numpy_array(self):
        # Fitting/transforming a plain numpy array must give the same values as
        # fitting/transforming the equivalent DataFrame.
        imputer = PerColumnImputer()

        X = pd.DataFrame(index=list(range(100)))

        X["NaNs"] = np.nan * np.ones(100)
        X["PINF"] = np.inf * np.ones(100)
        X["NINF"] = -np.inf * np.ones(100)

        X_numpy = X.values

        imputer.fit(X)
        selected_X = imputer.transform(X)

        # re-initialize for new dicts
        imputer = PerColumnImputer()
        imputer.fit(X_numpy)
        selected_X_numpy = imputer.transform(X_numpy)

        npt.assert_array_equal(selected_X.values, selected_X_numpy.values)

        # assertEqual, not assertTrue: assertTrue(a, b) treats ``b`` as the failure
        # message and passes for any truthy ``a``. The input has 100 rows and 3 columns.
        self.assertEqual(selected_X_numpy.shape, (100, 3))

    def test_standard_replacement_behavior(self):
        # Without presets: -inf -> column min, +inf -> column max, NaN -> column median.
        imputer = PerColumnImputer()

        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
        X = pd.DataFrame({"a": data})
        true_X = pd.DataFrame({"a": truth})

        imputer.fit(X)
        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_partial_preset_col_to_NINF_given(self):
        # Only the -inf preset is supplied; +inf and NaN replacements come from fit.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
        X = pd.DataFrame({"a": data})
        true_X = pd.DataFrame({"a": truth})

        col_to_min = {"a": -100}
        imputer = PerColumnImputer(col_to_NINF_repl_preset=col_to_min)

        imputer.fit(X)
        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_partial_preset_col_to_PINF_given(self):
        # Only the +inf preset is supplied; -inf and NaN replacements come from fit.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
        X = pd.DataFrame({"a": data})
        true_X = pd.DataFrame({"a": truth})

        col_to_max = {"a": 100}
        imputer = PerColumnImputer(col_to_PINF_repl_preset=col_to_max)

        imputer.fit(X)
        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_partial_preset_col_to_NAN_given(self):
        # Only the NaN preset is supplied; -inf and +inf replacements come from fit.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
        X = pd.DataFrame({"a": data})
        true_X = pd.DataFrame({"a": truth})

        col_to_median = {"a": 1}
        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)

        imputer.fit(X)
        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_different_shapes_fitted_and_transformed(self):
        # Transforming a frame with a column that was not present during fit must fail.
        imputer = PerColumnImputer()

        X = pd.DataFrame(index=list(range(10)))
        X["a"] = np.ones(10)

        imputer.fit(X)
        X["b"] = np.ones(10)

        self.assertRaises(ValueError, imputer.transform, X)

    def test_preset_has_higher_priority_than_fit(self):
        # The fitted median of "a" is 1, but the preset value 0 must win for NaN.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth = [-100.0, 100.0, 0.0, 100.0, -100.0, 1.0, 1.0]

        X = pd.DataFrame({"a": data})
        true_X = pd.DataFrame({"a": truth})

        col_to_median = {"a": 0}
        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)
        imputer.fit(X)

        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_only_parameters_of_last_fit_count(self):
        # A second fit must completely replace the values learned by the first one.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        data_2 = [-np.inf, np.inf, np.nan, 10.0, -10.0, 3.0, 3.0]
        truth_a = [-10.0, 10.0, 3.0, 10.0, -10.0, 3.0, 3.0]
        truth_b = [-10.0, 10.0, 3.0, 10.0, -10.0, 3.0, 3.0]

        X = pd.DataFrame({"a": data, "b": data})
        X_2 = pd.DataFrame({"a": data_2, "b": data_2})
        true_X = pd.DataFrame({"a": truth_a, "b": truth_b})

        imputer = PerColumnImputer()

        imputer.fit(X)
        imputer.fit(X_2)

        selected_X = imputer.transform(X_2)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_only_subset_of_columns_given(self):
        # The preset covers column "a" only; column "b" must fall back to its fitted median.
        data = [-np.inf, np.inf, np.nan, 100.0, -100.0, 1.0, 1.0]
        truth_a = [-100.0, 100.0, 0.0, 100.0, -100.0, 1.0, 1.0]
        truth_b = [-100.0, 100.0, 1.0, 100.0, -100.0, 1.0, 1.0]
        X = pd.DataFrame({"a": data, "b": data})
        true_X = pd.DataFrame({"a": truth_a, "b": truth_b})

        col_to_median = {"a": 0}
        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)

        imputer.fit(X)
        selected_X = imputer.transform(X)

        pdt.assert_frame_equal(selected_X, true_X)

    def test_NINF_preset_contains_more_columns_than_dataframe_to_fit(self):
        # A -inf preset naming a column ("b") that X does not have must be rejected by fit.
        X = pd.DataFrame(index=list(range(10)))
        X["a"] = np.ones(10)

        col_to_min = {"a": 0, "b": 0}

        imputer = PerColumnImputer(col_to_NINF_repl_preset=col_to_min)

        self.assertRaises(ValueError, imputer.fit, X)

    def test_PINF_preset_contains_more_columns_than_dataframe_to_fit(self):
        # A +inf preset naming a column ("b") that X does not have must be rejected by fit.
        X = pd.DataFrame(index=list(range(10)))
        X["a"] = np.ones(10)

        col_to_max = {"a": 0, "b": 0}

        imputer = PerColumnImputer(col_to_PINF_repl_preset=col_to_max)

        self.assertRaises(ValueError, imputer.fit, X)

    def test_NAN_preset_contains_more_columns_than_dataframe_to_fit(self):
        # A NaN preset naming a column ("b") that X does not have must be rejected by fit.
        X = pd.DataFrame(index=list(range(10)))
        X["a"] = np.ones(10)

        col_to_median = {"a": 0, "b": 0}

        imputer = PerColumnImputer(col_to_NAN_repl_preset=col_to_median)

        self.assertRaises(ValueError, imputer.fit, X)

tsfresh/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
from tsfresh.transformers.feature_augmenter import FeatureAugmenter
77
from tsfresh.transformers.feature_selector import FeatureSelector
88
from tsfresh.transformers.relevant_feature_augmenter import RelevantFeatureAugmenter
9+
from tsfresh.transformers.per_column_imputer import PerColumnImputer
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# -*- coding: utf-8 -*-
2+
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
3+
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
4+
5+
from sklearn.base import BaseEstimator, TransformerMixin
6+
from sklearn.exceptions import NotFittedError
7+
from tsfresh.utilities.dataframe_functions import get_range_values_per_column, impute_dataframe_range
8+
import pandas as pd
9+
10+
class PerColumnImputer(BaseEstimator, TransformerMixin):
    """
    Sklearn-compatible estimator, for column-wise imputing DataFrames by replacing all ``NaNs`` and ``infs``
    with average/extreme values from the same columns. It is basically a wrapper around
    :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` (used in ``.fit``) and
    :func:`~tsfresh.utilities.dataframe_functions.impute_dataframe_range` (used in ``.transform``).

    Each occurring ``inf`` or ``NaN`` in the DataFrame is replaced by

        * ``-inf`` -> ``min``
        * ``+inf`` -> ``max``
        * ``NaN`` -> ``median``

    This estimator - as most of the sklearn estimators - works in a two step procedure. First, the ``.fit``
    function is called where for each column the min, max and median are computed.
    Secondly, the ``.transform`` function is called which replaces the occurrences of ``NaNs`` and ``infs`` using
    the column-wise computed min, max and median values.

    Replacement values for individual columns can be fixed up front through the ``*_preset`` dictionaries;
    a preset value takes precedence over the value computed in ``.fit``.
    """
    def __init__(self, col_to_NINF_repl_preset=None, col_to_PINF_repl_preset=None, col_to_NAN_repl_preset=None):
        """
        Create a new PerColumnImputer instance, optionally with dictionaries containing replacements for
        ``NaNs`` and ``infs``.

        :param col_to_NINF_repl_preset: Dictionary mapping column names to ``-inf`` replacement values
        :type col_to_NINF_repl_preset: dict
        :param col_to_PINF_repl_preset: Dictionary mapping column names to ``+inf`` replacement values
        :type col_to_PINF_repl_preset: dict
        :param col_to_NAN_repl_preset: Dictionary mapping column names to ``NaN`` replacement values
        :type col_to_NAN_repl_preset: dict
        """
        # Fitted state; stays None until fit() has been called successfully.
        self._col_to_NINF_repl = None
        self._col_to_PINF_repl = None
        self._col_to_NAN_repl = None
        # Constructor parameters are stored unmodified under their own names.
        self.col_to_NINF_repl_preset = col_to_NINF_repl_preset
        self.col_to_PINF_repl_preset = col_to_PINF_repl_preset
        self.col_to_NAN_repl_preset = col_to_NAN_repl_preset

    @staticmethod
    def _merge_preset(col_to_value, preset, preset_name, columns):
        """
        Overwrite the computed per-column values with the entries of a preset dictionary.

        :param col_to_value: Dictionary mapping column names to the values computed from the data;
            updated in place with the preset entries
        :type col_to_value: dict
        :param preset: Preset dictionary or None if the user did not pass one
        :type preset: dict or None
        :param preset_name: Name of the constructor parameter, used in the error message
        :type preset_name: str
        :param columns: Column names of the DataFrame that is being fitted
        :type columns: iterable

        :return: `col_to_value`, with preset entries taking precedence
        :rtype: dict
        :raise ValueError: if the preset contains keys that are not columns of the fitted DataFrame
        """
        if preset is None:
            return col_to_value

        if not set(preset.keys()).issubset(set(columns)):
            message = "Preset dictionary '{}' contain more keys than the column names in X".format(preset_name)
            raise ValueError(message)

        col_to_value.update(preset)
        return col_to_value

    def fit(self, X, y=None):
        """
        Compute the min, max and median for all columns in the DataFrame. For more information,
        please see the :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function.
        Values given in the preset dictionaries override the computed ones.

        :param X: DataFrame to calculate min, max and median values on
        :type X: pandas.DataFrame
        :param y: Unneeded.
        :type y: Any

        :return: the estimator with the computed min, max and median values
        :rtype: PerColumnImputer
        :raise ValueError: if one of the preset dictionaries contains keys that are not columns of `X`
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        col_to_max, col_to_min, col_to_median = get_range_values_per_column(X)

        # All presets are validated and merged before any attribute is assigned, so a rejected
        # preset does not leave the estimator with a mix of old and new replacement values.
        ninf_repl = self._merge_preset(col_to_min, self.col_to_NINF_repl_preset,
                                       "col_to_NINF_repl_preset", X.columns)
        pinf_repl = self._merge_preset(col_to_max, self.col_to_PINF_repl_preset,
                                       "col_to_PINF_repl_preset", X.columns)
        nan_repl = self._merge_preset(col_to_median, self.col_to_NAN_repl_preset,
                                      "col_to_NAN_repl_preset", X.columns)

        self._col_to_NINF_repl = ninf_repl
        self._col_to_PINF_repl = pinf_repl
        self._col_to_NAN_repl = nan_repl

        return self

    def transform(self, X):
        """
        Column-wise replace all ``NaNs``, ``-inf`` and ``+inf`` in the DataFrame `X` with average/extreme
        values from the dictionaries computed in ``.fit``. The replacement itself is delegated to
        :func:`~tsfresh.utilities.dataframe_functions.impute_dataframe_range`.

        :param X: DataFrame to impute
        :type X: pandas.DataFrame

        :return: imputed DataFrame
        :rtype: pandas.DataFrame
        :raise NotFittedError: if the replacement dictionaries are still of None type.
            This can happen if the transformer was not fitted.
        """

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if self._col_to_NINF_repl is None or self._col_to_PINF_repl is None or self._col_to_NAN_repl is None:
            raise NotFittedError("PerColumnImputer is not fitted")

        # NOTE(review): whether `X` is modified in place or copied is decided by
        # impute_dataframe_range - confirm before relying on the input staying untouched.
        X = impute_dataframe_range(X, self._col_to_PINF_repl, self._col_to_NINF_repl, self._col_to_NAN_repl)

        return X

0 commit comments

Comments
 (0)