From 93d3a00e25bc5de1ac54353960c00d3935d45063 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Wed, 27 Sep 2023 17:03:38 +0000
Subject: [PATCH 01/15] feat: Allow passing index objects to df.drop

---
 bigframes/dataframe.py                              | 12 +++++++++---
 .../.ipynb_checkpoints/Untitled-checkpoint.ipynb    |  6 ++++++
 .../small/.ipynb_checkpoints/untitled-checkpoint.py |  0
 tests/system/small/test_dataframe.py                | 10 ++++++++++
 4 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 create mode 100644 tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 113355589b..c0e6ed9ccf 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -906,16 +906,22 @@ def drop(
         columns = labels
 
         block = self._block
-        if index:
+        if index is not None:
             level_id = self._resolve_levels(level or 0)[0]
 
-            if utils.is_list_like(index):
+            if utils.is_list_like(index) or isinstance(index, pandas.Index):
                 block, inverse_condition_id = block.apply_unary_op(
                     level_id, ops.IsInOp(index, match_nulls=True)
                 )
                 block, condition_id = block.apply_unary_op(
                     inverse_condition_id, ops.invert_op
                 )
+            elif isinstance(index, indexes.Index):
+                # idea: make a value column with the same values as index
+                # align index with self so that the new value column is NA
+                # for rows that weren't in index originally
+                # then filter by the index's value column == self index
+                pass
             else:
                 block, condition_id = block.apply_unary_op(
                     level_id, ops.partial_right(ops.ne_op, index)
@@ -925,7 +931,7 @@ def drop(
         )
         if columns:
             block = block.drop_columns(self._sql_names(columns))
-        if not index and not columns:
+        if index is None and not columns:
             raise ValueError("Must specify 'labels' or 'index'/'columns")
         return DataFrame(block)
 
diff --git a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
new file mode 100644
index 0000000000..363fcab7ed
--- /dev/null
+++ b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py b/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index adf17848ee..d3670fdfee 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -258,6 +258,16 @@ def test_drop_index(scalars_dfs):
     pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
+def test_drop_pandas_index(scalars_dfs):
+    # TODO
+    pass
+
+
+def test_drop_bigframes_index(scalars_dfs):
+    # TODO
+    pass
+
+
 def test_drop_labels_axis_0(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
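
Patch 01 only sketches the new branch; the comment block records the intended algorithm and the tests are still stubs. For orientation, this is the user-facing behaviour the series is working toward, shown here with plain pandas rather than BigQuery DataFrames (a minimal illustration, not code from the patch):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

# An Index object produced by another operation, e.g. a filter.
to_drop = df[df["a"] % 2 == 0].index  # labels of the rows where "a" is even

# drop(index=...) accepts the Index directly; only the named rows are removed.
print(df.drop(index=to_drop))
#    a   b
# 0  1  10
# 2  3  30
```
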
From ee8eb0d431e72890ea52aa952f9ef8c5e7f2a5d9 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Wed, 27 Sep 2023 22:06:16 +0000
Subject: [PATCH 02/15] remove notebook files

---
 .../small/.ipynb_checkpoints/Untitled-checkpoint.ipynb     | 6 ------
 .../system/small/.ipynb_checkpoints/untitled-checkpoint.py | 0
 2 files changed, 6 deletions(-)
 delete mode 100644 tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 delete mode 100644 tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py

diff --git a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
deleted file mode 100644
index 363fcab7ed..0000000000
--- a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "cells": [],
- "metadata": {},
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py b/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py
deleted file mode 100644
index e69de29bb2..0000000000

From 0fc8dd76d84cc6013a65d123e4d4ac4b3d961902 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 01:04:42 +0000
Subject: [PATCH 03/15] add first implementation for df.drop(index)

---
 bigframes/dataframe.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index c0e6ed9ccf..50a5514290 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -917,11 +917,22 @@ def drop(
                     inverse_condition_id, ops.invert_op
                 )
             elif isinstance(index, indexes.Index):
-                # idea: make a value column with the same values as index
-                # align index with self so that the new value column is NA
-                # for rows that weren't in index originally
-                # then filter by the index's value column == self index
-                pass
+                block = index._data._get_block()
+                original_value_columns = block.value_columns
+                block = blocks.Block(block._expr, [], block._expr.column_names.keys())
+                level_names = ["level_" + str(n) for n in range(index.nlevels)]
+                block = block.set_index(level_names, drop=False)
+                index_df = DataFrame(block)
+                index_df = index_df.drop(columns=original_value_columns)
+                df_with_indices_to_drop = self.join(index_df)
+                bool_series = df_with_indices_to_drop["level_0"].isna()
+                for i in range(1, index.nlevels):
+                    bool_series = (
+                        bool_series & df_with_indices_to_drop[level_names[i]].isna()
+                    )
+                result = df_with_indices_to_drop[bool_series]
+                result = result.drop(columns=level_names)
+                return result
             else:
                 block, condition_id = block.apply_unary_op(
                     level_id, ops.partial_right(ops.ne_op, index)
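
The first implementation joins the caller against a frame built from the index argument and keeps only the rows whose joined columns come back NA. A rough plain-pandas sketch of that join-then-filter idea (the `__marker__` column name is invented for the example; the real code works on Block objects and level columns):

```python
import pandas as pd

df = pd.DataFrame({"val": [10, 20, 30, 40]}, index=[0, 1, 2, 3])
index_to_drop = pd.Index([1, 2])

# A one-column frame keyed by the labels to drop; the column is just a marker
# that survives the join.
marker = pd.DataFrame({"__marker__": 1}, index=index_to_drop)

# Left join: rows that were NOT in index_to_drop come back with NA markers.
joined = df.join(marker)
print(joined[joined["__marker__"].isna()].drop(columns="__marker__"))
#    val
# 0   10
# 3   40
```
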
From 5cdfff91d225bc71e414cd4d5235d533eb0bd33c Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 15:13:56 +0000
Subject: [PATCH 04/15] use index_columns property

---
 bigframes/dataframe.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 50a5514290..ecf6a19199 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -919,19 +919,19 @@ def drop(
             elif isinstance(index, indexes.Index):
                 block = index._data._get_block()
                 original_value_columns = block.value_columns
+                original_index_columns = block.index_columns
                 block = blocks.Block(block._expr, [], block._expr.column_names.keys())
-                level_names = ["level_" + str(n) for n in range(index.nlevels)]
-                block = block.set_index(level_names, drop=False)
+                block = block.set_index(original_index_columns, drop=False)
                 index_df = DataFrame(block)
                 index_df = index_df.drop(columns=original_value_columns)
                 df_with_indices_to_drop = self.join(index_df)
-                bool_series = df_with_indices_to_drop["level_0"].isna()
-                for i in range(1, index.nlevels):
+                bool_series = df_with_indices_to_drop[original_index_columns[0]].isna()
+                for index_name in original_index_columns[1:]:
                     bool_series = (
-                        bool_series & df_with_indices_to_drop[level_names[i]].isna()
+                        bool_series & df_with_indices_to_drop[index_name].isna()
                     )
                 result = df_with_indices_to_drop[bool_series]
-                result = result.drop(columns=level_names)
+                result = result.drop(columns=list(original_index_columns))
                 return result
             else:
                 block, condition_id = block.apply_unary_op(

From 99f2f59a244b333c519060d44116074f3b2b318b Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 15:17:15 +0000
Subject: [PATCH 05/15] don't use _expr.keys()

---
 bigframes/dataframe.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index ecf6a19199..6c8e30557d 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -918,9 +918,11 @@ def drop(
                 )
             elif isinstance(index, indexes.Index):
                 block = index._data._get_block()
-                original_value_columns = block.value_columns
-                original_index_columns = block.index_columns
-                block = blocks.Block(block._expr, [], block._expr.column_names.keys())
+                original_value_columns = list(block.value_columns)
+                original_index_columns = list(block.index_columns)
+                block = blocks.Block(
+                    block._expr, [], original_value_columns + original_index_columns
+                )
                 block = block.set_index(original_index_columns, drop=False)
                 index_df = DataFrame(block)
                 index_df = index_df.drop(columns=original_value_columns)
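
Patches 04 and 05 replace the hard-coded `level_0`/`level_N` names with the index columns the block actually reports. The underlying point, shown with plain pandas (illustrative only): index levels carry arbitrary names, so code that round-trips an index through columns has to ask for those names rather than assume a `level_N` convention.

```python
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(b"x", 1.5), (b"y", 2.5)], names=["bytes_col", "numeric_col"]
)
print(list(idx.names))  # ['bytes_col', 'numeric_col'], not 'level_0'/'level_1'

# Materialising the levels as columns uses the real names:
df = pd.DataFrame({"val": [1, 2]}, index=idx).reset_index()
print(list(df.columns))  # ['bytes_col', 'numeric_col', 'val']
```
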
From f4db207d7b0c2020176cdc1aad8d052d77cc33eb Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 16:08:00 +0000
Subject: [PATCH 06/15] fix order bug and add test

---
 bigframes/dataframe.py                              |  2 +-
 .../Untitled-checkpoint.ipynb                       |  6 ++++++
 .../.ipynb_checkpoints/untitled-checkpoint.py       |  0
 tests/system/small/test_dataframe.py                | 22 +++++++++++++++++----
 4 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 create mode 100644 tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 6c8e30557d..68f185f15a 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -921,7 +921,7 @@ def drop(
                 original_value_columns = list(block.value_columns)
                 original_index_columns = list(block.index_columns)
                 block = blocks.Block(
-                    block._expr, [], original_value_columns + original_index_columns
+                    block._expr, [], original_index_columns + original_value_columns
                 )
                 block = block.set_index(original_index_columns, drop=False)
                 index_df = DataFrame(block)
diff --git a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
new file mode 100644
index 0000000000..363fcab7ed
--- /dev/null
+++ b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py b/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index d3670fdfee..f0e96951f8 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -259,13 +259,27 @@ def test_drop_index(scalars_dfs):
 
 
 def test_drop_pandas_index(scalars_dfs):
-    # TODO
-    pass
+    scalars_df, scalars_pandas_df = scalars_dfs
+    drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index
+
+    pd_result = scalars_pandas_df.drop(index=drop_index)
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
 def test_drop_bigframes_index(scalars_dfs):
-    # TODO
-    pass
+    scalars_df, scalars_pandas_df = scalars_dfs
+    drop_index = scalars_df.loc[[4, 1, 2]].index
+    drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index
+
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+
+    print(pd_result)
+    print(bf_result)
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
 def test_drop_labels_axis_0(scalars_dfs):

From ec7c6a743b0feddc1464a6b31ee6092a4e1d3517 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 16:10:44 +0000
Subject: [PATCH 07/15] fix index names

---
 bigframes/dataframe.py               | 2 ++
 tests/system/small/test_dataframe.py | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 68f185f15a..8c32097338 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -920,6 +920,7 @@ def drop(
                 block = index._data._get_block()
                 original_value_columns = list(block.value_columns)
                 original_index_columns = list(block.index_columns)
+                original_index_names = self.index.names
                 block = blocks.Block(
                     block._expr, [], original_index_columns + original_value_columns
                 )
@@ -934,6 +935,7 @@ def drop(
                     )
                 result = df_with_indices_to_drop[bool_series]
                 result = result.drop(columns=original_index_columns)
+                result.index.names = original_index_names
                 return result
             else:
                 block, condition_id = block.apply_unary_op(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index f0e96951f8..a6ef62eabe 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -276,9 +276,6 @@ def test_drop_bigframes_index(scalars_dfs):
     pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
     bf_result = scalars_df.drop(index=drop_index).to_pandas()
 
-    print(pd_result)
-    print(bf_result)
-
     pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
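
Patch 07 restores the caller's index names after the temporary re-indexing. The tests compare against pandas with `assert_frame_equal`, which checks index names by default, so a result that differs only in `index.names` still fails. A minimal pandas illustration (not from the patch):

```python
import pandas as pd

left = pd.DataFrame({"a": [1, 2]}, index=pd.Index([0, 1], name="rowindex"))
right = pd.DataFrame({"a": [1, 2]}, index=pd.Index([0, 1], name=None))

try:
    pd.testing.assert_frame_equal(left, right)
except AssertionError:
    print("fails: index names differ")  # values and labels are identical

# The comparison passes once the names match (or with check_names=False).
pd.testing.assert_frame_equal(left, right, check_names=False)
```
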
From fd22b014c0de6f382a923fe72f809c6ca5d22591 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 18:03:46 +0000
Subject: [PATCH 08/15] support multiindex

---
 bigframes/dataframe.py               | 65 +++++++++++++++++++---------
 tests/system/small/test_dataframe.py | 36 ++++++++++++++++
 2 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 8c32097338..e7037d2a48 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -917,26 +917,7 @@ def drop(
                     inverse_condition_id, ops.invert_op
                 )
             elif isinstance(index, indexes.Index):
-                block = index._data._get_block()
-                original_value_columns = list(block.value_columns)
-                original_index_columns = list(block.index_columns)
-                original_index_names = self.index.names
-                block = blocks.Block(
-                    block._expr, [], original_index_columns + original_value_columns
-                )
-                block = block.set_index(original_index_columns, drop=False)
-                index_df = DataFrame(block)
-                index_df = index_df.drop(columns=original_value_columns)
-                df_with_indices_to_drop = self.join(index_df)
-                bool_series = df_with_indices_to_drop[original_index_columns[0]].isna()
-                for index_name in original_index_columns[1:]:
-                    bool_series = (
-                        bool_series & df_with_indices_to_drop[index_name].isna()
-                    )
-                result = df_with_indices_to_drop[bool_series]
-                result = result.drop(columns=original_index_columns)
-                result.index.names = original_index_names
-                return result
+                return self._drop_by_index(index)
             else:
                 block, condition_id = block.apply_unary_op(
                     level_id, ops.partial_right(ops.ne_op, index)
@@ -950,6 +931,50 @@ def drop(
             raise ValueError("Must specify 'labels' or 'index'/'columns")
         return DataFrame(block)
 
+    def _drop_by_index(self, index: indexes.Index):
+        block = index._data._get_block()
+        original_value_columns = list(block.value_columns)
+        original_index_columns = list(block.index_columns)
+        original_index_names = self.index.names
+        # move all the columns to value columns
+        block = blocks.Block(
+            block._expr, [], original_index_columns + original_value_columns
+        )
+        # additionally restore index columns in order to join
+        block = block.set_index(original_index_columns, drop=False)
+        index_df = DataFrame(block)
+        original_isna = index_df[original_index_columns[0]].isna()
+        for index_name in original_index_columns[1:]:
+            original_isna = original_isna & index_df[index_name].isna()
+        # used to drop NA-labeled rows later
+        original_has_all_na_row = original_isna.any()
+
+        # value columns on the index argument are superfluous and could cause
+        # name conflicts, so we drop them
+        index_df = index_df.drop(columns=original_value_columns)
+        index_df.index.names = original_index_names
+        df_with_indices_to_drop = self.join(index_df)
+        # df_with_indices_to_drop has columns from the original index argument's
+        # index columns, and if all such columns are for a row, it means that
+        # row was not listed and therefore should be kept. All rows with entries in
+        # the original index argument should be dropped.
+        bool_series = df_with_indices_to_drop[original_index_columns[0]].isna()
+        for index_name in original_index_columns[1:]:
+            bool_series = bool_series & df_with_indices_to_drop[index_name].isna()
+        result = df_with_indices_to_drop[bool_series]
+        result = result.drop(columns=original_index_columns)
+        result.index.names = original_index_names
+        # if the user passed a label to drop, it will not be dropped yet,
+        # so we drop all labeled rows here if needed
+        if original_has_all_na_row:
+            num_keys = len(original_index_columns)
+            if num_keys == 1:
+                result = result.drop(index=[None])
+            else:
+                none_key = [tuple([None] * num_keys)]
+                result = result.drop(index=none_key)
+        return result
+
     def droplevel(self, level: LevelsType, axis: int | str = 0):
         axis_n = utils.get_axis_number(axis)
         if axis_n == 0:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index a6ef62eabe..e957823182 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -279,6 +279,42 @@ def test_drop_bigframes_index(scalars_dfs):
     pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
+def test_drop_bigframes_index_with_na(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    scalars_df = scalars_df.copy()
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_df = scalars_df.set_index("bytes_col")
+    scalars_pandas_df = scalars_pandas_df.set_index("bytes_col")
+    drop_index = scalars_df.iloc[[3, 5]].index
+    drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index
+
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)  # drop_pandas_index)
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+
+    print(pd_result)
+    print(bf_result)
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
+def test_drop_bigframes_multiindex(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    scalars_df = scalars_df.copy()
+    scalars_pandas_df = scalars_pandas_df.copy()
+    sub_df = scalars_df.iloc[[4, 1, 2]]
+    sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]]
+    sub_df = sub_df.set_index(["bytes_col", "numeric_col"])
+    sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"])
+    drop_index = sub_df.index
+    drop_pandas_index = sub_pandas_df.index
+
+    scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"])
+    scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"])
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
 def test_drop_labels_axis_0(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
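
A note on the NA bookkeeping in `_drop_by_index`: a NULL key never matches under SQL join semantics, so an NA label in the index argument would not be caught by the join-and-`isna` filter. That is presumably why the method records `original_has_all_na_row` up front and finishes with an explicit `drop(index=[None])` (or a tuple of `None`s for a MultiIndex). The single-level case in plain pandas (illustrative only):

```python
import pandas as pd

df = pd.DataFrame(
    {"val": [1, 2, 3]},
    index=pd.Index([b"x", None, b"y"], name="bytes_col"),
)

# Dropping the NA-labeled row explicitly, as the fallback branch does:
print(df.drop(index=[None]))
#            val
# bytes_col
# b'x'         1
# b'y'         3
```
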
From b0b0533bbd1affb48f3ac8c494913db4fc649fc0 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 18:05:04 +0000
Subject: [PATCH 09/15] remove accidentally added files

---
 .../small/.ipynb_checkpoints/Untitled-checkpoint.ipynb     | 6 ------
 .../system/small/.ipynb_checkpoints/untitled-checkpoint.py | 0
 2 files changed, 6 deletions(-)
 delete mode 100644 tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 delete mode 100644 tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py

diff --git a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
deleted file mode 100644
index 363fcab7ed..0000000000
--- a/tests/system/small/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "cells": [],
- "metadata": {},
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py b/tests/system/small/.ipynb_checkpoints/untitled-checkpoint.py
deleted file mode 100644
index e69de29bb2..0000000000

From 9a5afdd046817a08ce9cf5e306705407a3ced593 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 18:21:21 +0000
Subject: [PATCH 10/15] add type hint

---
 bigframes/dataframe.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index e7037d2a48..5feca9c4bd 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -931,7 +931,7 @@ def drop(
             raise ValueError("Must specify 'labels' or 'index'/'columns")
         return DataFrame(block)
 
-    def _drop_by_index(self, index: indexes.Index):
+    def _drop_by_index(self, index: indexes.Index) -> DataFrame:
         block = index._data._get_block()
         original_value_columns = list(block.value_columns)
         original_index_columns = list(block.index_columns)
@@ -943,6 +943,7 @@ def drop(
         # additionally restore index columns in order to join
         block = block.set_index(original_index_columns, drop=False)
         index_df = DataFrame(block)
+        index_df.index.names = original_index_names
         original_isna = index_df[original_index_columns[0]].isna()
         for index_name in original_index_columns[1:]:
             original_isna = original_isna & index_df[index_name].isna()
@@ -952,7 +953,6 @@ def drop(
         # value columns on the index argument are superfluous and could cause
        # name conflicts, so we drop them
         index_df = index_df.drop(columns=original_value_columns)
-        index_df.index.names = original_index_names
         df_with_indices_to_drop = self.join(index_df)
         # df_with_indices_to_drop has columns from the original index argument's
         # index columns, and if all such columns are for a row, it means that
@@ -963,7 +963,6 @@ def drop(
             bool_series = bool_series & df_with_indices_to_drop[index_name].isna()
         result = df_with_indices_to_drop[bool_series]
         result = result.drop(columns=original_index_columns)
-        result.index.names = original_index_names
         # if the user passed a label to drop, it will not be dropped yet,
         # so we drop all labeled rows here if needed
         if original_has_all_na_row:

From 451b901ea19051a70ddc97a791258e41756c31bd Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Thu, 28 Sep 2023 19:31:39 +0000
Subject: [PATCH 11/15] remove debug print statements

---
 tests/system/small/test_dataframe.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index e957823182..72f2ae47bc 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -291,8 +291,6 @@ def test_drop_bigframes_index_with_na(scalars_dfs):
     pd_result = scalars_pandas_df.drop(index=drop_pandas_index)  # drop_pandas_index)
     bf_result = scalars_df.drop(index=drop_index).to_pandas()
 
-    print(pd_result)
-    print(bf_result)
     pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
From 6a5ac4fa5aef976d18a388da53aaedcfd51308b4 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Mon, 2 Oct 2023 22:41:04 +0000
Subject: [PATCH 12/15] fix: fix df/series.iloc by list with multiindex

---
 bigframes/core/indexers.py           |  15 ++--
 tests/system/small/test_dataframe.py | 114 ++++++---------------------
 2 files changed, 31 insertions(+), 98 deletions(-)

diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py
index a538c80711..e22a5aed85 100644
--- a/bigframes/core/indexers.py
+++ b/bigframes/core/indexers.py
@@ -332,8 +332,6 @@ def _iloc_getitem_series_or_dataframe(
     elif isinstance(key, slice):
         return series_or_dataframe._slice(key.start, key.stop, key.step)
     elif pd.api.types.is_list_like(key):
-        # TODO(henryjsolberg): support MultiIndex
-
         if len(key) == 0:
             return typing.cast(
                 typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
@@ -346,15 +344,18 @@ def _iloc_getitem_series_or_dataframe(
                 original_series_name if original_series_name is not None else "0"
             )
             df = series_or_dataframe.to_frame()
-        original_index_name = df.index.name
-        temporary_index_name = guid.generate_guid(prefix="temp_iloc_index_")
-        df = df.rename_axis(temporary_index_name)
+        original_index_names = df.index.names
+        temporary_index_names = [
+            guid.generate_guid(prefix="temp_iloc_index_")
+            for _ in range(len(df.index.names))
+        ]
+        df = df.rename_axis(temporary_index_names)
 
         # set to offset index and use regular loc, then restore index
         df = df.reset_index(drop=False)
         result = df.loc[key]
-        result = result.set_index(temporary_index_name)
-        result = result.rename_axis(original_index_name)
+        result = result.set_index(temporary_index_names)
+        result = result.rename_axis(original_index_names)
 
         if isinstance(series_or_dataframe, bigframes.series.Series):
             result = result[series_name]
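
The indexer fix generalises the temporary-rename trick from one index name to one generated name per level, so `.iloc` with a list also works on a MultiIndex. The same idea in plain pandas (the `temp_iloc_index_*` names below are stand-ins for the GUIDs that `guid.generate_guid` produces):

```python
import pandas as pd

df = pd.DataFrame(
    {"val": [10, 20, 30]},
    index=pd.MultiIndex.from_tuples(
        [(b"x", 1), (b"y", 2), (b"z", 3)], names=["bytes_col", "numeric_col"]
    ),
)
key = [2, 0]

original_names = df.index.names
temp_names = [f"temp_iloc_index_{i}" for i in range(len(original_names))]

# Rename every level, move the index into columns, select by offset with .loc
# on the fresh RangeIndex, then restore the index and its original names.
tmp = df.rename_axis(temp_names).reset_index(drop=False)
result = tmp.loc[key].set_index(temp_names).rename_axis(original_names)
print(result)  # rows at positions 2 and 0, in that order
```
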
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 3d50a609b2..afba8804d3 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1266,77 +1266,6 @@ def test_combine(
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
 
-@pytest.mark.parametrize(
-    ("overwrite", "filter_func"),
-    [
-        (True, None),
-        (False, None),
-        (True, lambda x: x.isna() | (x % 2 == 0)),
-    ],
-    ids=[
-        "default",
-        "overwritefalse",
-        "customfilter",
-    ],
-)
-def test_df_update(overwrite, filter_func):
-    if pd.__version__.startswith("1."):
-        pytest.skip("dtype handled differently in pandas 1.x.")
-    index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
-    index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
-    pd_df1 = pandas.DataFrame(
-        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
-    )
-    pd_df2 = pandas.DataFrame(
-        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
-        dtype="Int64",
-        index=index2,
-    )
-
-    bf_df1 = dataframe.DataFrame(pd_df1)
-    bf_df2 = dataframe.DataFrame(pd_df2)
-
-    bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func)
-    pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func)
-
-    pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
-
-
-@pytest.mark.parametrize(
-    ("join", "axis"),
-    [
-        ("outer", None),
-        ("outer", 0),
-        ("outer", 1),
-        ("left", 0),
-        ("right", 1),
-        ("inner", None),
-        ("inner", 1),
-    ],
-)
-def test_df_align(join, axis):
-    index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
-    index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
-    pd_df1 = pandas.DataFrame(
-        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
-    )
-    pd_df2 = pandas.DataFrame(
-        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
-        dtype="Int64",
-        index=index2,
-    )
-
-    bf_df1 = dataframe.DataFrame(pd_df1)
-    bf_df2 = dataframe.DataFrame(pd_df2)
-
-    bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis)
-    pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis)
-
-    # Don't check dtype as pandas does unnecessary float conversion
-    pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False)
-    pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False)
-
-
 def test_combine_first(
     scalars_df_index,
     scalars_df_2_index,
@@ -1358,6 +1287,11 @@ def test_combine_first(
     pd_df_b.columns = ["b", "a", "d"]
     pd_result = pd_df_a.combine_first(pd_df_b)
 
+    print("pandas")
+    print(pd_result.to_string())
+    print("bigframes")
+    print(bf_result.to_string())
+
     # Some dtype inconsistency for all-NULL columns
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
@@ -1826,26 +1760,6 @@ def test_df_stack(scalars_dfs):
     pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
 
 
-def test_df_unstack(scalars_dfs):
-    scalars_df, scalars_pandas_df = scalars_dfs
-    # To match bigquery dataframes
-    scalars_pandas_df = scalars_pandas_df.copy()
-    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
-    # Can only stack identically-typed columns
-    columns = [
-        "rowindex_2",
-        "int64_col",
-        "int64_too",
-    ]
-
-    # unstack on mono-index produces series
-    bf_result = scalars_df[columns].unstack().to_pandas()
-    pd_result = scalars_pandas_df[columns].unstack()
-
-    # Pandas produces NaN, where bq dataframes produces pd.NA
-    pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
-
-
 @pytest.mark.parametrize(
     ("values", "index", "columns"),
     [
@@ -2580,6 +2494,24 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+def test_iloc_list_multiindex(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    scalars_df = scalars_df.copy()
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"])
+    scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"])
+
+    index_list = [0, 0, 0, 5, 4, 7]
+
+    bf_result = scalars_df.iloc[index_list]
+    pd_result = scalars_pandas_df.iloc[index_list]
+
+    pd.testing.assert_frame_equal(
+        bf_result.to_pandas(),
+        pd_result,
+    )
+
+
 def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index):
     index_list = []
 
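
The new `test_iloc_list_multiindex` feeds repeated, out-of-order positions. The pandas behaviour it compares against: `.iloc` with a list preserves the requested order and duplicates regardless of the index type (a quick check, not from the patch):

```python
import pandas as pd

df = pd.DataFrame(
    {"val": [10, 20, 30]},
    index=pd.MultiIndex.from_tuples([("a", 1), ("b", 2), ("c", 3)]),
)
print(df.iloc[[0, 0, 2]]["val"].tolist())  # [10, 10, 30]
```
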
From 01334bd8d09908503adbe6bb857965385096b5f8 Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Tue, 3 Oct 2023 17:06:49 +0000
Subject: [PATCH 13/15] complete merge

---
 tests/system/small/test_dataframe.py | 96 ++++++++++++++++++++++++++--
 1 file changed, 91 insertions(+), 5 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index ff9799d16b..db5689a7d5 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1299,6 +1299,77 @@ def test_combine(
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
 
+@pytest.mark.parametrize(
+    ("overwrite", "filter_func"),
+    [
+        (True, None),
+        (False, None),
+        (True, lambda x: x.isna() | (x % 2 == 0)),
+    ],
+    ids=[
+        "default",
+        "overwritefalse",
+        "customfilter",
+    ],
+)
+def test_df_update(overwrite, filter_func):
+    if pd.__version__.startswith("1."):
+        pytest.skip("dtype handled differently in pandas 1.x.")
+    index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
+    index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
+    pd_df1 = pandas.DataFrame(
+        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+    )
+    pd_df2 = pandas.DataFrame(
+        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+        dtype="Int64",
+        index=index2,
+    )
+
+    bf_df1 = dataframe.DataFrame(pd_df1)
+    bf_df2 = dataframe.DataFrame(pd_df2)
+
+    bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func)
+    pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func)
+
+    pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
+
+
+@pytest.mark.parametrize(
+    ("join", "axis"),
+    [
+        ("outer", None),
+        ("outer", 0),
+        ("outer", 1),
+        ("left", 0),
+        ("right", 1),
+        ("inner", None),
+        ("inner", 1),
+    ],
+)
+def test_df_align(join, axis):
+    index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
+    index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
+    pd_df1 = pandas.DataFrame(
+        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+    )
+    pd_df2 = pandas.DataFrame(
+        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+        dtype="Int64",
+        index=index2,
+    )
+
+    bf_df1 = dataframe.DataFrame(pd_df1)
+    bf_df2 = dataframe.DataFrame(pd_df2)
+
+    bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis)
+    pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis)
+
+    # Don't check dtype as pandas does unnecessary float conversion
+    pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False)
+    pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False)
+
+
 def test_combine_first(
     scalars_df_index,
     scalars_df_2_index,
@@ -1320,11 +1391,6 @@ def test_combine_first(
     pd_df_b.columns = ["b", "a", "d"]
     pd_result = pd_df_a.combine_first(pd_df_b)
 
-    print("pandas")
-    print(pd_result.to_string())
-    print("bigframes")
-    print(bf_result.to_string())
-
     # Some dtype inconsistency for all-NULL columns
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
@@ -1793,6 +1859,26 @@ def test_df_stack(scalars_dfs):
     pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
 
 
+def test_df_unstack(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # To match bigquery dataframes
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+    # Can only stack identically-typed columns
+    columns = [
+        "rowindex_2",
+        "int64_col",
+        "int64_too",
+    ]
+
+    # unstack on mono-index produces series
+    bf_result = scalars_df[columns].unstack().to_pandas()
+    pd_result = scalars_pandas_df[columns].unstack()
+
+    # Pandas produces NaN, where bq dataframes produces pd.NA
+    pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     ("values", "index", "columns"),
     [
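
For reference, the restored `test_df_update` parametrizes `overwrite` and `filter_func`, which control pandas' in-place, NA-aware update. A compact pandas illustration of the two modes the test exercises (loosely mirroring its fixtures, not code from the patch):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, None, 3]}, dtype="Int64")
other = pd.DataFrame({"a": [10, 20, 30]}, dtype="Int64")

d1 = df.copy()
d1.update(other)  # overwrite=True (default): every aligned value is replaced
print(d1["a"].tolist())  # [10, 20, 30]

d2 = df.copy()
d2.update(other, overwrite=False)  # only NA values in the caller are filled
print(d2["a"].tolist())  # [1, 20, 3]
```
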
"int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas() + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("values", "index", "columns"), [ From a400b1d5cfba49fc7da26c34ddf56c285df15bef Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Tue, 3 Oct 2023 17:09:40 +0000 Subject: [PATCH 14/15] remove unneeded isinstance --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 476aa53cbe..a2104f81dd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -932,7 +932,7 @@ def drop( if index is not None: level_id = self._resolve_levels(level or 0)[0] - if utils.is_list_like(index) or isinstance(index, pandas.Index): + if utils.is_list_like(index): block, inverse_condition_id = block.apply_unary_op( level_id, ops.IsInOp(index, match_nulls=True) ) From afbf8c3a3ab0e6fa188504cd1ac90b22805b163d Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Tue, 3 Oct 2023 20:08:01 +0000 Subject: [PATCH 15/15] refactor _drop_by_index --- bigframes/dataframe.py | 56 +++++++++++++----------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a2104f81dd..1770898345 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -956,46 +956,24 @@ def drop( def _drop_by_index(self, index: indexes.Index) -> DataFrame: block = index._data._get_block() - original_value_columns = list(block.value_columns) - original_index_columns = list(block.index_columns) - original_index_names = self.index.names - # move all the columns to value columns - block = blocks.Block( - block._expr, [], original_index_columns + original_value_columns + block, ordering_col = block.promote_offsets() + joined_index, (get_column_left, get_column_right) = self._block.index.join( + block.index ) - # additionally restore index columns in order to join - block = block.set_index(original_index_columns, drop=False) - index_df = DataFrame(block) - index_df.index.names = original_index_names - original_isna = index_df[original_index_columns[0]].isna() - for index_name in original_index_columns[1:]: - original_isna = original_isna & index_df[index_name].isna() - # used to drop NA-labeled rows later - original_has_all_na_row = original_isna.any() - - # value columns on the index argument are superfluous and could cause - # name conflicts, so we drop them - index_df = index_df.drop(columns=original_value_columns) - df_with_indices_to_drop = self.join(index_df) - # df_with_indices_to_drop has columns from the original index argument's - # index columns, and if all such columns are for a row, it means that - # row was not listed and therefore should be kept. All rows with entries in - # the original index argument should be dropped. 
-        bool_series = df_with_indices_to_drop[original_index_columns[0]].isna()
-        for index_name in original_index_columns[1:]:
-            bool_series = bool_series & df_with_indices_to_drop[index_name].isna()
-        result = df_with_indices_to_drop[bool_series]
-        result = result.drop(columns=original_index_columns)
-        # if the user passed a label to drop, it will not be dropped yet,
-        # so we drop all labeled rows here if needed
-        if original_has_all_na_row:
-            num_keys = len(original_index_columns)
-            if num_keys == 1:
-                result = result.drop(index=[None])
-            else:
-                none_key = [tuple([None] * num_keys)]
-                result = result.drop(index=none_key)
-        return result
+
+        new_ordering_col = get_column_right(ordering_col)
+        drop_block = joined_index._block
+        drop_block, drop_col = drop_block.apply_unary_op(
+            new_ordering_col,
+            ops.isnull_op,
+        )
+
+        drop_block = drop_block.filter(drop_col)
+        original_columns = [
+            get_column_left(column) for column in self._block.value_columns
+        ]
+        drop_block = drop_block.select_columns(original_columns)
+        return DataFrame(drop_block)
 
     def droplevel(self, level: LevelsType, axis: int | str = 0):
         axis_n = utils.get_axis_number(axis)
         if axis_n == 0:
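
The final shape of `_drop_by_index` works at the Block level: give the index being dropped a row-offset column, join it onto the caller's index, and keep only the rows whose joined offset is NULL, i.e. the rows that found no match. A rough plain-pandas sketch of the same idea (the `__offset__` column is invented for the example; `promote_offsets`, `apply_unary_op` and the other Block calls are internal BigQuery DataFrames APIs):

```python
import pandas as pd

df = pd.DataFrame(
    {"val": [10, 20, 30, 40]}, index=pd.Index([0, 1, 2, 3], name="rowindex")
)
index_to_drop = pd.Index([1, 3], name="rowindex")

# "Promote offsets": materialise a row-number column for the index to drop.
offsets = pd.DataFrame({"__offset__": range(len(index_to_drop))}, index=index_to_drop)

# Left join on the index; rows that are not being dropped get NA offsets.
joined = df.join(offsets)

# Keep the unmatched rows, then restore the original set of columns.
print(joined[joined["__offset__"].isna()][df.columns])
#           val
# rowindex
# 0          10
# 2          30
```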