Maximum rolling (blue-yonder#184)

nils-braun · MaxBenChrist · commit cd66c66c3a24 · 2017-04-02T23:17:31.000+02:00
* Added a maximum number of shifts parameter

* Added a test for this

* Make documentation more precise.

* Added more tests
diff --git a/tests/utilities/test_dataframe_functions.py b/tests/utilities/test_dataframe_functions.py
@@ -213,22 +213,53 @@ def test_positive_rolling(self):
 
         df_full = pd.concat([first_class, second_class], ignore_index=True)
 
-        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
-                                                  column_kind=None, rolling_direction=1)
-
         correct_indices = (["id=1, shift=3"] * 1 +
                            ["id=1, shift=2"] * 2 +
                            ["id=1, shift=1"] * 3 +
                            ["id=2, shift=1"] * 1 +
                            ["id=1, shift=0"] * 4 +
                            ["id=2, shift=0"] * 2)
+        correct_values_a = [1, 1, 2, 1, 2, 3, 10, 1, 2, 3, 4, 10, 11]
+        correct_values_b = [5, 5, 6, 5, 6, 7, 12, 5, 6, 7, 8, 12, 13]
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=1)
+
+        self.assertListEqual(list(df["id"]), correct_indices)
+        self.assertListEqual(list(df["a"].values), correct_values_a)
+        self.assertListEqual(list(df["b"].values), correct_values_b)
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=1,
+                                                  maximum_number_of_timeshifts=None)
 
         self.assertListEqual(list(df["id"]), correct_indices)
+        self.assertListEqual(list(df["a"].values), correct_values_a)
+        self.assertListEqual(list(df["b"].values), correct_values_b)
 
-        self.assertListEqual(list(df["a"].values),
-                             [1, 1, 2, 1, 2, 3, 10, 1, 2, 3, 4, 10, 11])
-        self.assertListEqual(list(df["b"].values),
-                             [5, 5, 6, 5, 6, 7, 12, 5, 6, 7, 8, 12, 13])
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=1,
+                                                  maximum_number_of_timeshifts=1)
+
+        self.assertListEqual(list(df["id"]), correct_indices[3:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[3:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[3:])
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=1,
+                                                  maximum_number_of_timeshifts=2)
+
+        self.assertListEqual(list(df["id"]), correct_indices[1:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[1:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[1:])
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=1,
+                                                  maximum_number_of_timeshifts=4)
+
+        self.assertListEqual(list(df["id"]), correct_indices[:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[:])
 
     def test_negative_rolling(self):
         first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
@@ -239,22 +270,53 @@ def test_negative_rolling(self):
 
         df_full = pd.concat([first_class, second_class], ignore_index=True)
 
-        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
-                                                  column_kind=None, rolling_direction=-1)
-
         correct_indices = (["id=1, shift=-3"] * 1 +
                            ["id=1, shift=-2"] * 2 +
                            ["id=1, shift=-1"] * 3 +
                            ["id=2, shift=-1"] * 1 +
                            ["id=1, shift=0"] * 4 +
                            ["id=2, shift=0"] * 2)
+        correct_values_a = [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11]
+        correct_values_b = [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13]
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=-1)
 
         self.assertListEqual(list(df["id"].values), correct_indices)
+        self.assertListEqual(list(df["a"].values), correct_values_a)
+        self.assertListEqual(list(df["b"].values), correct_values_b)
 
-        self.assertListEqual(list(df["a"].values),
-                             [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11])
-        self.assertListEqual(list(df["b"].values),
-                             [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13])
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=-1,
+                                                  maximum_number_of_timeshifts=None)
+
+        self.assertListEqual(list(df["id"].values), correct_indices)
+        self.assertListEqual(list(df["a"].values), correct_values_a)
+        self.assertListEqual(list(df["b"].values), correct_values_b)
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=-1,
+                                                  maximum_number_of_timeshifts=1)
+
+        self.assertListEqual(list(df["id"].values), correct_indices[3:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[3:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[3:])
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=-1,
+                                                  maximum_number_of_timeshifts=2)
+
+        self.assertListEqual(list(df["id"].values), correct_indices[1:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[1:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[1:])
+
+        df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time",
+                                                  column_kind=None, rolling_direction=-1,
+                                                  maximum_number_of_timeshifts=4)
+
+        self.assertListEqual(list(df["id"].values), correct_indices[:])
+        self.assertListEqual(list(df["a"].values), correct_values_a[:])
+        self.assertListEqual(list(df["b"].values), correct_values_b[:])
 
     def test_stacked_rolling(self):
         first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
@@ -309,8 +371,6 @@ def test_dict_rolling(self):
         self.assertListEqual(list(df["b"]["_value"].values),
                              [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13])
 
-
-
     def test_warning_on_non_uniform_time_steps(self):
         with warnings.catch_warnings(record=True) as w:
             first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": [1, 2, 4, 5]})
diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py
@@ -328,7 +328,8 @@ def normalize_input_to_internal_representation(df_or_dict, column_id, column_sor
     return kind_to_df_map, column_id, column_value
 
 
-def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction):
+def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction,
+                     maximum_number_of_timeshifts=None):
     """
     Roll the (sorted) data frames for each kind and each id separately in the "time" domain
     (which is represented by the sort order of the sort column given by `column_sort`).
@@ -360,6 +361,9 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di
     :type column_kind: basestring or None
     :param rolling_direction: The sign decides, if to roll backwards or forwards in "time"
     :type rolling_direction: int
+    :param maximum_number_of_timeshifts: If not None, shift only up to maximum_number_of_timeshifts.
+        If None, shift as often as possible.
+    :type maximum_number_of_timeshifts: int
 
     :return: The rolled data frame or dictionary of data frames
     :rtype: the one from df_or_dict
@@ -416,7 +420,7 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di
     rolling_direction = np.sign(rolling_direction)
 
     grouped_data = df.groupby(grouper)
-    maximum_number_of_timeshifts = grouped_data.count().max().max()
+    maximum_number_of_timeshifts = maximum_number_of_timeshifts or grouped_data.count().max().max()
 
     if np.isnan(maximum_number_of_timeshifts):
         raise ValueError("Somehow the maximum length of your time series is NaN (Does your time series container have "