From d6bc14ff338958197ce7bb31d93aa65793d29f07 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 17 Mar 2025 22:49:16 +0000 Subject: [PATCH 1/7] feat: support iloc with negative indices --- bigframes/core/indexers.py | 31 +++++++++++++++++++++++++++- tests/system/small/test_dataframe.py | 2 +- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index d1a0c42e97..eed932c156 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -497,7 +497,36 @@ def _iloc_getitem_series_or_dataframe( block = df._block # explicitly set index to offsets, reset_index may not generate offsets in some modes block, offsets_id = block.promote_offsets("temp_iloc_offsets_") - block = block.set_index([offsets_id]) + pos_block = block.set_index([offsets_id]) + max_agg_id = guid.generate_guid() + max_agg_specs = [ + ( + ex.UnaryAggregation(ops.aggregations.max_op, ex.deref(offsets_id)), + max_agg_id, + ), + ] + plus_one_expr = ops.add_op.as_expr(ex.deref(max_agg_id), ex.const(1)) + max_agg_expr, _ = block.expr.aggregate(max_agg_specs).project_to_id( + plus_one_expr + ) + + max_index_block = bigframes.core.blocks.Block( + max_agg_expr.drop_columns(max_agg_id), + column_labels=["row_count"], + index_columns=[], + ) + block = block.reset_index(drop=False).merge( + max_index_block, how="cross", left_join_ids=[], right_join_ids=[], sort=True + ) + + block, _ = block.apply_binary_op( + block.value_columns[-2], block.value_columns[-1], ops.sub_op + ) + + block = block.set_index([block.value_columns[-1]]) + + block = pos_block.concat([block], how="inner") + df = bigframes.dataframe.DataFrame(block) result = df.loc[key] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 37144265b8..05f0e504bd 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4409,7 +4409,7 @@ def test_loc_list_multiindex(scalars_dfs_maybe_ordered): def test_iloc_list(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7] + index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3] bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] From d57ed9b94bf32120d37626184a2aa87ea16ee789 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 17 Mar 2025 22:50:18 +0000 Subject: [PATCH 2/7] update partial ordering test --- tests/system/small/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 05f0e504bd..dbff13fefd 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4423,7 +4423,7 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): def test_iloc_list_partial_ordering( scalars_df_partial_ordering, scalars_pandas_df_index ): - index_list = [0, 0, 0, 5, 4, 7] + index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3] bf_result = scalars_df_partial_ordering.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] From d839ce8701a797111a4507a62c08f1ac73ebaa7d Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 17 Mar 2025 22:53:10 +0000 Subject: [PATCH 3/7] update naming --- bigframes/core/indexers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index eed932c156..969305be21 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -505,10 +505,8 @@ def _iloc_getitem_series_or_dataframe( max_agg_id, ), ] - plus_one_expr = ops.add_op.as_expr(ex.deref(max_agg_id), ex.const(1)) - max_agg_expr, _ = block.expr.aggregate(max_agg_specs).project_to_id( - plus_one_expr - ) + plus_one_op = ops.add_op.as_expr(ex.deref(max_agg_id), ex.const(1)) + max_agg_expr, _ = block.expr.aggregate(max_agg_specs).project_to_id(plus_one_op) max_index_block = bigframes.core.blocks.Block( max_agg_expr.drop_columns(max_agg_id), From ea61d0708fa148a3dc1e52cdca4c3c4564d43590 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 18 Mar 2025 23:53:21 +0000 Subject: [PATCH 4/7] update logic --- bigframes/core/compile/aggregate_compiler.py | 6 +++ bigframes/core/indexers.py | 54 +++++++++++--------- bigframes/operations/aggregations.py | 8 +++ 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index f96471e200..0c74cdadf3 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -350,6 +350,12 @@ def _( return _apply_window_if_present(column.count(), window) +@compile_unary_agg.register +def _(op: agg_ops.ReverseRowNumberOp, column: ibis_types.NumericColumn, window=None): + row_count = _apply_window_if_present(ibis_ops.count(1), window) + return column - row_count + + @compile_unary_agg.register def _( op: agg_ops.CutOp, diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 969305be21..96370eaef7 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -27,6 +27,7 @@ import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar +import bigframes.core.window_spec as windows import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions as bfe @@ -477,6 +478,24 @@ def _iloc_getitem_series_or_dataframe( Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) + + # Check if both positive row numbers and negative row numbers are necessary + if isinstance(key, bigframes.series.Series): + # Avoid data download + is_key_unisigned = False + elif "shape" in series_or_dataframe._block.__dict__: + # If there is a cache, we convert all indices to positive. + row_count = series_or_dataframe._block.shape[0] + key = [k if k >= 0 else row_count + k for k in key] + is_key_unisigned = True + else: + first_sign = key[0] >= 0 + is_key_unisigned = True + for k in key: + if (k >= 0) != first_sign: + is_key_unisigned = False + break + if isinstance(series_or_dataframe, bigframes.series.Series): original_series_name = series_or_dataframe.name series_name = ( @@ -498,32 +517,21 @@ def _iloc_getitem_series_or_dataframe( # explicitly set index to offsets, reset_index may not generate offsets in some modes block, offsets_id = block.promote_offsets("temp_iloc_offsets_") pos_block = block.set_index([offsets_id]) - max_agg_id = guid.generate_guid() - max_agg_specs = [ - ( - ex.UnaryAggregation(ops.aggregations.max_op, ex.deref(offsets_id)), - max_agg_id, - ), - ] - plus_one_op = ops.add_op.as_expr(ex.deref(max_agg_id), ex.const(1)) - max_agg_expr, _ = block.expr.aggregate(max_agg_specs).project_to_id(plus_one_op) - - max_index_block = bigframes.core.blocks.Block( - max_agg_expr.drop_columns(max_agg_id), - column_labels=["row_count"], - index_columns=[], - ) - block = block.reset_index(drop=False).merge( - max_index_block, how="cross", left_join_ids=[], right_join_ids=[], sort=True - ) - block, _ = block.apply_binary_op( - block.value_columns[-2], block.value_columns[-1], ops.sub_op - ) + if not is_key_unisigned or key[0] < 0: + neg_block, _ = block.apply_window_op( + offsets_id, + ops.aggregations.ReverseRowNumberOp(), + window_spec=windows.rows(), + result_label="Reversed_Index", + ) - block = block.set_index([block.value_columns[-1]]) + neg_block = neg_block.set_index([neg_block.value_columns[-1]]) - block = pos_block.concat([block], how="inner") + if is_key_unisigned: + block = pos_block if key[0] >= 0 else neg_block + else: + block = pos_block.concat([neg_block], how="inner") df = bigframes.dataframe.DataFrame(block) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 933bf972d0..974abd30b3 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -335,6 +335,14 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) +@dataclasses.dataclass(frozen=True) +class ReverseRowNumberOp(UnaryWindowOp): + name: ClassVar[str] = "size" + + def output_type(self, *input_types): + return dtypes.INT_DTYPE + + @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? From fade680d74797def9c051e55dfa81196a01072cf Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 18 Mar 2025 23:55:20 +0000 Subject: [PATCH 5/7] update comment --- bigframes/core/indexers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 96370eaef7..3d2375207e 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -479,7 +479,7 @@ def _iloc_getitem_series_or_dataframe( series_or_dataframe.iloc[0:0], ) - # Check if both positive row numbers and negative row numbers are necessary + # Check if both positive index and negative index are necessary if isinstance(key, bigframes.series.Series): # Avoid data download is_key_unisigned = False From 7f585043c0bc3da57167492f78ae442a3c1a6689 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 19 Mar 2025 17:56:40 +0000 Subject: [PATCH 6/7] update logic and tests --- bigframes/core/compile/aggregate_compiler.py | 6 ----- bigframes/core/indexers.py | 17 +++++++------- bigframes/operations/aggregations.py | 8 ------- tests/system/small/test_dataframe.py | 24 +++++++++++++++----- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 0c74cdadf3..f96471e200 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -350,12 +350,6 @@ def _( return _apply_window_if_present(column.count(), window) -@compile_unary_agg.register -def _(op: agg_ops.ReverseRowNumberOp, column: ibis_types.NumericColumn, window=None): - row_count = _apply_window_if_present(ibis_ops.count(1), window) - return column - row_count - - @compile_unary_agg.register def _( op: agg_ops.CutOp, diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 3d2375207e..e07173449a 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -483,11 +483,6 @@ def _iloc_getitem_series_or_dataframe( if isinstance(key, bigframes.series.Series): # Avoid data download is_key_unisigned = False - elif "shape" in series_or_dataframe._block.__dict__: - # If there is a cache, we convert all indices to positive. - row_count = series_or_dataframe._block.shape[0] - key = [k if k >= 0 else row_count + k for k in key] - is_key_unisigned = True else: first_sign = key[0] >= 0 is_key_unisigned = True @@ -519,14 +514,18 @@ def _iloc_getitem_series_or_dataframe( pos_block = block.set_index([offsets_id]) if not is_key_unisigned or key[0] < 0: - neg_block, _ = block.apply_window_op( + neg_block, size_col_id = block.apply_window_op( offsets_id, - ops.aggregations.ReverseRowNumberOp(), + ops.aggregations.SizeUnaryOp(), window_spec=windows.rows(), - result_label="Reversed_Index", + ) + neg_block, neg_index_id = neg_block.apply_binary_op( + offsets_id, size_col_id, ops.SubOp() ) - neg_block = neg_block.set_index([neg_block.value_columns[-1]]) + neg_block = neg_block.set_index([neg_index_id]).drop_columns( + [size_col_id, offsets_id] + ) if is_key_unisigned: block = pos_block if key[0] >= 0 else neg_block diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index c9a461cc35..d25791d3e4 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -335,14 +335,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -@dataclasses.dataclass(frozen=True) -class ReverseRowNumberOp(UnaryWindowOp): - name: ClassVar[str] = "size" - - def output_type(self, *input_types): - return dtypes.INT_DTYPE - - @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1791e74357..c2e4a1c8ad 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4400,9 +4400,15 @@ def test_loc_list_multiindex(scalars_dfs_maybe_ordered): ) -def test_iloc_list(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3] - +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] @@ -4412,11 +4418,17 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) def test_iloc_list_partial_ordering( - scalars_df_partial_ordering, scalars_pandas_df_index + scalars_df_partial_ordering, scalars_pandas_df_index, index_list ): - index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3] - bf_result = scalars_df_partial_ordering.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] From 130fde35e9fd497993fb9bdac1f0c121e980eed5 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 19 Mar 2025 20:43:29 +0000 Subject: [PATCH 7/7] update filter --- bigframes/core/indexers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index e07173449a..6258eb00d5 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -480,7 +480,7 @@ def _iloc_getitem_series_or_dataframe( ) # Check if both positive index and negative index are necessary - if isinstance(key, bigframes.series.Series): + if isinstance(key, (bigframes.series.Series, indexes.Index)): # Avoid data download is_key_unisigned = False else: