From 72acc3a73a3ad6a6f2c6c41c9b9c81afd1e9ed0f Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 17 Oct 2023 00:34:29 +0000
Subject: [PATCH 1/4] feat: add dataframe melt

---
 bigframes/core/blocks.py                      | 41 ++++++++++++++++-
 bigframes/dataframe.py                        | 38 ++++++++++++++++
 tests/system/small/test_dataframe.py          | 44 +++++++++++++++++++
 tests/system/small/test_multiindex.py         | 29 ++++++++++++
 .../bigframes_vendored/pandas/core/frame.py   | 28 ++++++++++++
 5 files changed, 178 insertions(+), 2 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 4548fca593..49e610e037 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1338,13 +1338,50 @@ def stack(self, how="left", levels: int = 1):
             index_columns = [*added_index_columns, *self.index_columns]
             index_labels = [*new_index_level_names, *self._index_labels]
 
-        block = Block(
+        return Block(
             unpivot_expr,
             index_columns=index_columns,
             column_labels=result_index,
             index_labels=index_labels,
         )
-        return block
+
+    def melt(
+        self,
+        id_vars: typing.Sequence[str],
+        value_vars: typing.Sequence[str],
+        var_names: typing.Sequence[typing.Hashable],
+        value_name: typing.Hashable = "value",
+    ):
+        """Unpivot value_vars into label/value rows; id_vars pass through unchanged."""
+        # TODO: Implement col_level and ignore_index
+        unpivot_col_id = guid.generate_guid()
+        var_col_ids = tuple(guid.generate_guid() for _ in var_names)
+        # One output column for all value_vars; dtype is taken from value_vars[0]
+        unpivot_col = (unpivot_col_id, tuple(value_vars))
+        value_labels = [self.col_id_to_label[col_id] for col_id in value_vars]
+        id_labels = [self.col_id_to_label[col_id] for col_id in id_vars]
+
+        dtype = self._expr.get_column_type(value_vars[0])
+        unpivot_expr = self._expr.unpivot(
+            row_labels=value_labels,
+            passthrough_columns=id_vars,
+            unpivot_columns=(unpivot_col,),
+            index_col_ids=var_col_ids,
+            dtype=dtype,
+            how="right",
+        )
+        index_id = guid.generate_guid()
+        unpivot_expr = unpivot_expr.promote_offsets(index_id)
+        # Need to reorder to get id_vars before var_col and unpivot_col
+        unpivot_expr = unpivot_expr.select_columns(
+            [index_id, *id_vars, *var_col_ids, unpivot_col_id]
+        )
+
+        return Block(
+            unpivot_expr,
+            column_labels=[*id_labels, *var_names, value_name],
+            index_columns=[index_id],
+        )
 
     def _create_stack_column(
         self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index c91ddffada..3406675827 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1683,6 +1683,44 @@ def idxmin(self) -> bigframes.series.Series:
     def idxmax(self) -> bigframes.series.Series:
         return bigframes.series.Series(block_ops.idxmax(self._block))
 
+    def melt(
+        self,
+        id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
+        value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
+        var_name: typing.Union[
+            typing.Hashable, typing.Sequence[typing.Hashable], None
+        ] = None,
+        value_name: typing.Hashable = "value",
+    ) -> DataFrame:
+        if var_name is None:
+            # Default var_name: use the column level names when they are unique
+            if self.columns.nlevels > 1:
+                if len(set(self.columns.names)) == len(self.columns.names):
+                    var_name = self.columns.names
+                else:
+                    var_name = [f"variable_{i}" for i in range(len(self.columns.names))]
+            elif self.columns.name is not None:
+                # Compare against None so falsy names such as 0 or "" are kept
+                var_name = self.columns.name
+            else:
+                var_name = "variable"
+
+        var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,)
+
+        if id_vars is not None:
+            id_col_ids = [self._resolve_label_exact(col) for col in id_vars]
+        else:
+            id_col_ids = []
+        if value_vars is not None:
+            val_col_ids = [self._resolve_label_exact(col) for col in value_vars]
+        else:
+            # Default to every value column that is not an identifier
+            val_col_ids = [c for c in self._block.value_columns if c not in id_col_ids]
+
+        return DataFrame(
+            self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
+        )
+
     def describe(self) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 9f1092d09d..0d56fa54b5 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1901,6 +1901,50 @@ def test_df_stack(scalars_dfs):
     pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
 
 
+def test_df_melt_default(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # Cast column labels to string[pyarrow] to match BigQuery DataFrames
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+    # Can only melt identically-typed columns
+    columns = ["int64_col", "int64_too", "rowindex_2"]
+
+    bf_result = scalars_df[columns].melt().to_pandas()
+    pd_result = scalars_pandas_df[columns].melt()
+
+    # Pandas produces int64 index, Bigframes produces Int64 (nullable)
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_df_melt_parameterized(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # Cast column labels to string[pyarrow] to match BigQuery DataFrames
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+    # Can only melt identically-typed columns, so both value_vars are int64
+
+    bf_result = scalars_df.melt(
+        var_name="alice",
+        value_name="bob",
+        id_vars=["string_col"],
+        value_vars=["int64_col", "int64_too"],
+    ).to_pandas()
+    pd_result = scalars_pandas_df.melt(
+        var_name="alice",
+        value_name="bob",
+        id_vars=["string_col"],
+        value_vars=["int64_col", "int64_too"],
+    )
+
+    print("pandas")
+    print(pd_result.to_string())
+    print("bigframes")
+    print(bf_result.to_string())
+
+    # Pandas produces int64 index, Bigframes produces Int64 (nullable)
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
 def test_df_unstack(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     # To match bigquery dataframes
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index a132676770..00cfe4dca2 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -752,6 +752,35 @@ def test_column_multi_index_stack(level):
     )
 
 
+def test_column_multi_index_melt():
+    if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
+        pytest.skip("pandas <2.1 uses different stack implementation")
+
+    level1 = pandas.Index(["b", "a", "b"])
+    level2 = pandas.Index(["a", "b", "b"])
+    level3 = pandas.Index(["b", "b", "a"])
+
+    multi_columns = pandas.MultiIndex.from_arrays(
+        [level1, level2, level3], names=["l1", "l2", "l3"]
+    )
+    pd_df = pandas.DataFrame(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        index=[5, 2, None],
+        columns=multi_columns,
+        dtype="Int64",
+    )
+    bf_df = bpd.DataFrame(pd_df)
+
+    bf_result = bf_df.melt().to_pandas()
+    # BigFrames emulates future_stack impl
+    pd_result = pd_df.melt()
+
+    # BigFrames uses different string and int types, but values are identical
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
 def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index):
     columns = ["int64_too", "int64_col", "rowindex_2"]
     level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]")
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index e54f984d59..cf67db719f 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1854,6 +1854,34 @@ def idxmax(self):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def melt(self, id_vars, value_vars, var_name, value_name):
+        """
+        Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
+
+        This function is useful to massage a DataFrame into a format where one
+        or more columns are identifier variables (`id_vars`), while all other
+        columns, considered measured variables (`value_vars`), are "unpivoted" to
+        the row axis, leaving just two non-identifier columns, 'variable' and
+        'value'.
+
+        Args:
+            id_vars (tuple, list, or ndarray, optional):
+                Column(s) to use as identifier variables.
+            value_vars (tuple, list, or ndarray, optional):
+                Column(s) to unpivot. If not specified, uses all columns that
+                are not set as `id_vars`.
+            var_name (scalar or list of scalars, optional):
+                Name to use for the 'variable' column. If None it uses
+                ``frame.columns.name`` or 'variable'. With MultiIndex
+                columns, one name per column level is used.
+            value_name (scalar, default 'value'):
+                Name to use for the 'value' column.
+
+        Returns:
+            DataFrame: Unpivoted DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def nunique(self):
         """
         Count number of distinct elements in specified axis.

From a3fced2ed8ab1cf6cbf98ec86444c1610cb5fde9 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 17 Oct 2023 20:35:21 +0000
Subject: [PATCH 2/4] removed test print statements

---
 tests/system/small/test_dataframe.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 0d56fa54b5..5211c831da 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1936,11 +1936,6 @@ def test_df_melt_parameterized(scalars_dfs):
         value_vars=["int64_col", "int64_too"],
     )
 
-    print("pandas")
-    print(pd_result.to_string())
-    print("bigframes")
-    print(bf_result.to_string())
-
     # Pandas produces int64 index, Bigframes produces Int64 (nullable)
     pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
 

From 6fa8a8b43b6f8f3b010356e0357addf844d16361 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 26 Oct 2023 17:13:09 +0000
Subject: [PATCH 3/4] make tests looser to pass on all pandas versions

---
 tests/system/small/test_dataframe.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index fe961f3fa5..cc8c9bbdb6 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1913,7 +1913,9 @@ def test_df_melt_default(scalars_dfs):
     pd_result = scalars_pandas_df[columns].melt()
 
     # Pandas produces int64 index, Bigframes produces Int64 (nullable)
-    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
 
 
 def test_df_melt_parameterized(scalars_dfs):
@@ -1937,7 +1939,9 @@ def test_df_melt_parameterized(scalars_dfs):
     )
 
     # Pandas produces int64 index, Bigframes produces Int64 (nullable)
-    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
 
 
 def test_df_unstack(scalars_dfs):

From 7f879ea565d5c645bfc0f26d39e5da180b87db43 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Sat, 28 Oct 2023 01:44:46 +0000
Subject: [PATCH 4/4] remove misplaced code comment from test

---
 tests/system/small/test_multiindex.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 3ce8276cf7..d6bf46f77c 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -772,7 +772,6 @@ def test_column_multi_index_melt():
     bf_df = bpd.DataFrame(pd_df)
 
     bf_result = bf_df.melt().to_pandas()
-    # BigFrames emulates future_stack impl
     pd_result = pd_df.melt()
 
     # BigFrames uses different string and int types, but values are identical