From 72acc3a73a3ad6a6f2c6c41c9b9c81afd1e9ed0f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 17 Oct 2023 00:34:29 +0000 Subject: [PATCH 1/4] feat: add dataframe melt --- bigframes/core/blocks.py | 41 ++++++++++++++++- bigframes/dataframe.py | 38 ++++++++++++++++ tests/system/small/test_dataframe.py | 44 +++++++++++++++++++ tests/system/small/test_multiindex.py | 29 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 28 ++++++++++++ 5 files changed, 178 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4548fca593..49e610e037 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1338,13 +1338,50 @@ def stack(self, how="left", levels: int = 1): index_columns = [*added_index_columns, *self.index_columns] index_labels = [*new_index_level_names, *self._index_labels] - block = Block( + return Block( unpivot_expr, index_columns=index_columns, column_labels=result_index, index_labels=index_labels, ) - return block + + def melt( + self, + id_vars=typing.Sequence[str], + value_vars=typing.Sequence[str], + var_names=typing.Sequence[typing.Hashable], + value_name: typing.Hashable = "value", + ): + # TODO: Implement col_level and ignore_index + unpivot_col_id = guid.generate_guid() + var_col_ids = tuple([guid.generate_guid() for _ in var_names]) + # single unpivot col + unpivot_col = (unpivot_col_id, tuple(value_vars)) + value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] + + dtype = self._expr.get_column_type(value_vars[0]) + + unpivot_expr = self._expr.unpivot( + row_labels=value_labels, + passthrough_columns=id_vars, + unpivot_columns=(unpivot_col,), + index_col_ids=var_col_ids, + dtype=dtype, + how="right", + ) + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + # Need to reorder to get id_vars before var_col and unpivot_col + unpivot_expr = 
unpivot_expr.select_columns( + [index_id, *id_vars, *var_col_ids, unpivot_col_id] + ) + + return Block( + unpivot_expr, + column_labels=[*id_labels, *var_names, value_name], + index_columns=[index_id], + ) def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c91ddffada..3406675827 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1683,6 +1683,44 @@ def idxmin(self) -> bigframes.series.Series: def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) + def melt( + self, + id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + var_name: typing.Union[ + typing.Hashable, typing.Sequence[typing.Hashable] + ] = None, + value_name: typing.Hashable = "value", + ): + if var_name is None: + # Determine default var_name. Attempt to use column labels if they are unique + if self.columns.nlevels > 1: + if len(set(self.columns.names)) == len(self.columns.names): + var_name = self.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(self.columns.names))] + else: + var_name = self.columns.name or "variable" + + var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,) + + if id_vars is not None: + id_col_ids = [self._resolve_label_exact(col) for col in id_vars] + else: + id_col_ids = [] + if value_vars is not None: + val_col_ids = [self._resolve_label_exact(col) for col in value_vars] + else: + val_col_ids = [ + col_id + for col_id in self._block.value_columns + if col_id not in id_col_ids + ] + + return DataFrame( + self._block.melt(id_col_ids, val_col_ids, var_name, value_name) + ) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: diff --git a/tests/system/small/test_dataframe.py 
b/tests/system/small/test_dataframe.py index 9f1092d09d..0d56fa54b5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1901,6 +1901,50 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + print("pandas") + print(pd_result.to_string()) + print("bigframes") + print(bf_result.to_string()) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_df_unstack(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a132676770..00cfe4dca2 100644 --- 
a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -752,6 +752,35 @@ def test_column_multi_index_stack(level): ) +def test_column_multi_index_melt(): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + + level1 = pandas.Index(["b", "a", "b"]) + level2 = pandas.Index(["a", "b", "b"]) + level3 = pandas.Index(["b", "b", "a"]) + + multi_columns = pandas.MultiIndex.from_arrays( + [level1, level2, level3], names=["l1", "l2", "l3"] + ) + pd_df = pandas.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.melt().to_pandas() + # BigFrames emulates future_stack impl + pd_result = pd_df.melt() + + # BigFrames uses different string and int types, but values are identical + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e54f984d59..cf67db719f 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1854,6 +1854,34 @@ def idxmax(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def melt(self, id_vars, value_vars, var_name, value_name): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. + + Returns: + DataFrame: Unpivoted DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. From a3fced2ed8ab1cf6cbf98ec86444c1610cb5fde9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 17 Oct 2023 20:35:21 +0000 Subject: [PATCH 2/4] removed test print statements --- tests/system/small/test_dataframe.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0d56fa54b5..5211c831da 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1936,11 +1936,6 @@ def test_df_melt_parameterized(scalars_dfs): value_vars=["int64_col", "int64_too"], ) - print("pandas") - print(pd_result.to_string()) - print("bigframes") - print(bf_result.to_string()) - # Pandas produces int64 index, Bigframes produces Int64 (nullable) pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) From 6fa8a8b43b6f8f3b010356e0357addf844d16361 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 26 Oct 2023 17:13:09 +0000 Subject: [PATCH 3/4] make tests looser to pass on all pandas versions ---
tests/system/small/test_dataframe.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index fe961f3fa5..cc8c9bbdb6 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1913,7 +1913,9 @@ def test_df_melt_default(scalars_dfs): pd_result = scalars_pandas_df[columns].melt() # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) def test_df_melt_parameterized(scalars_dfs): @@ -1937,7 +1939,9 @@ def test_df_melt_parameterized(scalars_dfs): ) # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) def test_df_unstack(scalars_dfs): From 7f879ea565d5c645bfc0f26d39e5da180b87db43 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 28 Oct 2023 01:44:46 +0000 Subject: [PATCH 4/4] remove misplaced code comment from test --- tests/system/small/test_multiindex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 3ce8276cf7..d6bf46f77c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -772,7 +772,6 @@ def test_column_multi_index_melt(): bf_df = bpd.DataFrame(pd_df) bf_result = bf_df.melt().to_pandas() - # BigFrames emulates future_stack impl pd_result = pd_df.melt() # BigFrames uses different string and int types, but values are identical