diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cc13edeaf9..635e7db865 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1356,13 +1356,50 @@ def stack(self, how="left", levels: int = 1): index_columns = [*added_index_columns, *self.index_columns] index_labels = [*new_index_level_names, *self._index_labels] - block = Block( + return Block( unpivot_expr, index_columns=index_columns, column_labels=result_index, index_labels=index_labels, ) - return block + + def melt( + self, + id_vars=typing.Sequence[str], + value_vars=typing.Sequence[str], + var_names=typing.Sequence[typing.Hashable], + value_name: typing.Hashable = "value", + ): + # TODO: Implement col_level and ignore_index + unpivot_col_id = guid.generate_guid() + var_col_ids = tuple([guid.generate_guid() for _ in var_names]) + # single unpivot col + unpivot_col = (unpivot_col_id, tuple(value_vars)) + value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] + + dtype = self._expr.get_column_type(value_vars[0]) + + unpivot_expr = self._expr.unpivot( + row_labels=value_labels, + passthrough_columns=id_vars, + unpivot_columns=(unpivot_col,), + index_col_ids=var_col_ids, + dtype=dtype, + how="right", + ) + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + # Need to reorder to get id_vars before var_col and unpivot_col + unpivot_expr = unpivot_expr.select_columns( + [index_id, *id_vars, *var_col_ids, unpivot_col_id] + ) + + return Block( + unpivot_expr, + column_labels=[*id_labels, *var_names, value_name], + index_columns=[index_id], + ) def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 01117d3e0a..49d7ad991a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1673,6 +1673,44 @@ def idxmin(self) -> 
bigframes.series.Series: def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) + def melt( + self, + id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + var_name: typing.Union[ + typing.Hashable, typing.Sequence[typing.Hashable] + ] = None, + value_name: typing.Hashable = "value", + ): + if var_name is None: + # Determine default var_name. Attempt to use column labels if they are unique + if self.columns.nlevels > 1: + if len(set(self.columns.names)) == len(self.columns.names): + var_name = self.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(self.columns.names))] + else: + var_name = self.columns.name or "variable" + + var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,) + + if id_vars is not None: + id_col_ids = [self._resolve_label_exact(col) for col in id_vars] + else: + id_col_ids = [] + if value_vars is not None: + val_col_ids = [self._resolve_label_exact(col) for col in value_vars] + else: + val_col_ids = [ + col_id + for col_id in self._block.value_columns + if col_id not in id_col_ids + ] + + return DataFrame( + self._block.melt(id_col_ids, val_col_ids, var_name, value_name) + ) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e459e3bee3..b503f9a31d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1919,6 +1919,49 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") 
def test_df_melt_parameterized(scalars_dfs):
    scalars_df, scalars_pandas_df = scalars_dfs
    # To match bigquery dataframes
    scalars_pandas_df = scalars_pandas_df.copy()
    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
    # Can only stack identically-typed columns

    # Same arguments for both implementations so results are comparable.
    melt_kwargs = dict(
        var_name="alice",
        value_name="bob",
        id_vars=["string_col"],
        value_vars=["int64_col", "int64_too"],
    )
    bf_result = scalars_df.melt(**melt_kwargs).to_pandas()
    pd_result = scalars_pandas_df.melt(**melt_kwargs)

    # Pandas produces int64 index, Bigframes produces Int64 (nullable)
    pd.testing.assert_frame_equal(
        bf_result, pd_result, check_index_type=False, check_dtype=False
    )
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.melt().to_pandas() + pd_result = pd_df.melt() + + # BigFrames uses different string and int types, but values are identical + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 13a81b4645..67836a8fd2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2010,6 +2010,34 @@ def idxmax(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def melt(self, id_vars, value_vars, var_name, value_name): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. + + Returns: + DataFrame: Unpivoted DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis.