Skip to content

feat: Allow df.drop to take an index object #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ def drop(
columns = labels

block = self._block
if index:
if index is not None:
level_id = self._resolve_levels(level or 0)[0]

if utils.is_list_like(index):
Expand All @@ -947,6 +947,8 @@ def drop(
block, condition_id = block.apply_unary_op(
inverse_condition_id, ops.invert_op
)
elif isinstance(index, indexes.Index):
return self._drop_by_index(index)
else:
block, condition_id = block.apply_unary_op(
level_id, ops.partial_right(ops.ne_op, index)
Expand All @@ -956,10 +958,31 @@ def drop(
)
if columns:
block = block.drop_columns(self._sql_names(columns))
if not index and not columns:
if index is None and not columns:
raise ValueError("Must specify 'labels' or 'index'/'columns")
return DataFrame(block)

def _drop_by_index(self, index: indexes.Index) -> DataFrame:
    """Return a new DataFrame without the rows matched by ``index``.

    The index of this frame is joined with the index of labels to drop.
    An offsets column promoted on the drop side is carried through the
    join, and the joined block is filtered on that column being null.

    Args:
        index: Index object holding the labels to remove.

    Returns:
        A DataFrame built from the filtered block, restricted to the
        columns that map back to this frame's value columns.
    """
    # Promote an offsets column on the drop side so that it can be looked up
    # again after the join.
    # NOTE(review): presumably the join keeps unmatched left rows with a null
    # on the right side, which makes "offsets is null" mean "label not in
    # `index`" -- confirm against the default join type of the index join.
    labels_block, offsets_id = index._data._get_block().promote_offsets()

    joined, column_getters = self._block.index.join(labels_block.index)
    left_getter, right_getter = column_getters

    # Translate the drop-side offsets column id into the joined block.
    marker_id = right_getter(offsets_id)
    result_block, keep_id = joined._block.apply_unary_op(marker_id, ops.isnull_op)
    result_block = result_block.filter(keep_id)

    # Keep only the columns that originate from this frame's value columns.
    kept_columns = [left_getter(col) for col in self._block.value_columns]
    return DataFrame(result_block.select_columns(kept_columns))

def droplevel(self, level: LevelsType, axis: int | str = 0):
axis_n = utils.get_axis_number(axis)
if axis_n == 0:
Expand Down
55 changes: 55 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,61 @@ def test_drop_index(scalars_dfs):
pd.testing.assert_frame_equal(pd_result, bf_result)


def test_drop_pandas_index(scalars_dfs):
    """Dropping rows by a pandas Index gives the same frame as pandas does."""
    bf_df, pd_df = scalars_dfs
    # The same pandas index object is passed to both implementations.
    labels_to_drop = pd_df.iloc[[4, 1, 2]].index

    expected = pd_df.drop(index=labels_to_drop)
    actual = bf_df.drop(index=labels_to_drop).to_pandas()

    pd.testing.assert_frame_equal(expected, actual)


def test_drop_bigframes_index(scalars_dfs):
    """Dropping rows by a BigQuery DataFrames index matches the pandas result."""
    bf_df, pd_df = scalars_dfs
    # Each library drops using an index object taken from its own frame.
    bf_labels = bf_df.loc[[4, 1, 2]].index
    pd_labels = pd_df.loc[[4, 1, 2]].index

    expected = pd_df.drop(index=pd_labels)
    actual = bf_df.drop(index=bf_labels).to_pandas()

    pd.testing.assert_frame_equal(expected, actual)


def test_drop_bigframes_index_with_na(scalars_dfs):
    """Dropping by an index built from ``bytes_col`` matches the pandas result.

    The test name suggests the ``bytes_col`` index contains NA values; that is
    a property of the fixture data and is not checked here.
    """
    bf_df, pd_df = scalars_dfs
    # Work on copies re-indexed by "bytes_col" so the fixture frames stay intact.
    bf_df = bf_df.copy().set_index("bytes_col")
    pd_df = pd_df.copy().set_index("bytes_col")
    bf_labels = bf_df.iloc[[3, 5]].index
    pd_labels = pd_df.iloc[[3, 5]].index

    expected = pd_df.drop(index=pd_labels)
    actual = bf_df.drop(index=bf_labels).to_pandas()

    pd.testing.assert_frame_equal(expected, actual)


def test_drop_bigframes_multiindex(scalars_dfs):
    """Dropping rows by a two-level index matches the pandas result."""
    index_keys = ["bytes_col", "numeric_col"]
    bf_df, pd_df = scalars_dfs
    bf_df = bf_df.copy()
    pd_df = pd_df.copy()

    # Build the labels to drop from a three-row subset, indexed the same way
    # the full frames are indexed below.
    bf_labels = bf_df.iloc[[4, 1, 2]].set_index(index_keys).index
    pd_labels = pd_df.iloc[[4, 1, 2]].set_index(index_keys).index

    bf_df = bf_df.set_index(index_keys)
    pd_df = pd_df.set_index(index_keys)
    actual = bf_df.drop(index=bf_labels).to_pandas()
    expected = pd_df.drop(index=pd_labels)

    pd.testing.assert_frame_equal(expected, actual)


def test_drop_labels_axis_0(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down