32 changes: 27 additions & 5 deletions bigframes/core/block_transforms.py
@@ -522,7 +522,8 @@ def rank(
def dropna(
block: blocks.Block,
column_ids: typing.Sequence[str],
how: typing.Literal["all", "any"] = "any",
how: str = "any",
thresh: typing.Optional[int] = None,
subset: Optional[typing.Sequence[str]] = None,
):
"""
@@ -531,17 +532,38 @@ def dropna(
if subset is None:
subset = column_ids

# Predicates to check for non-null values in the subset of columns
predicates = [
ops.notnull_op.as_expr(column_id)
for column_id in column_ids
if column_id in subset
]

if len(predicates) == 0:
return block
if how == "any":
predicate = functools.reduce(ops.and_op.as_expr, predicates)
else: # "all"
predicate = functools.reduce(ops.or_op.as_expr, predicates)

if thresh is not None:
# Handle single predicate case
if len(predicates) == 1:
count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0])
else:
# Sum the boolean expressions to count non-null values
count_expr = functools.reduce(
lambda a, b: ops.add_op.as_expr(
ops.AsTypeOp(pd.Int64Dtype()).as_expr(a),
ops.AsTypeOp(pd.Int64Dtype()).as_expr(b),
),
predicates,
)
# Filter rows where count >= thresh
predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
else:
# Only handle 'how' parameter when thresh is not specified
if how == "any":
predicate = functools.reduce(ops.and_op.as_expr, predicates)
else: # "all"
predicate = functools.reduce(ops.or_op.as_expr, predicates)

return block.filter(predicate)


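For rows (axis=0), the `thresh` branch above counts non-null values per row by casting each not-null predicate to Int64, summing the casts with `add_op`, and keeping the row when the count reaches `thresh`. A minimal pandas-level sketch of the same idea, using made-up toy data rather than the BigFrames expression tree:

```python
import pandas as pd

# Toy frame for illustration only; the real code operates on a bigframes Block.
df = pd.DataFrame(
    {
        "name": ["Alfred", "Batman", "Catwoman"],
        "toy": [None, "Batmobile", "Bullwhip"],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)

thresh = 2
# Cast each not-null indicator to an integer and sum across columns,
# mirroring the reduce over AsTypeOp/add_op in block_transforms.dropna.
non_null_counts = df.notnull().astype("Int64").sum(axis=1)
print(df[non_null_counts >= thresh])  # rows 1 and 2 have >= 2 non-null values
```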
54 changes: 41 additions & 13 deletions bigframes/dataframe.py
@@ -2802,6 +2802,7 @@ def dropna(
*,
axis: int | str = 0,
how: str = "any",
thresh: typing.Optional[int] = None,
subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
inplace: bool = False,
ignore_index=False,
@@ -2810,8 +2811,18 @@
raise NotImplementedError(
f"'inplace'=True not supported. {constants.FEEDBACK_LINK}"
)
if how not in ("any", "all"):
raise ValueError("'how' must be one of 'any', 'all'")

        # 'thresh' and a non-default 'how' are mutually exclusive
if thresh is not None:
# cannot specify both thresh and how parameters
if how != "any":
raise TypeError(
"You cannot set both the how and thresh arguments at the same time."
)
else:
# Only validate 'how' when thresh is not provided
if how not in ("any", "all"):
raise ValueError("'how' must be one of 'any', 'all'")

axis_n = utils.get_axis_number(axis)

@@ -2833,21 +2844,38 @@
for id_ in self._block.label_to_col_id[label]
]

result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore
result = block_ops.dropna(
self._block,
self._block.value_columns,
how=how,
thresh=thresh,
subset=subset_ids,
) # type: ignore
if ignore_index:
result = result.reset_index()
return DataFrame(result)
else:
isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
if how == "any":
null_locations = DataFrame(isnull_block).any().to_pandas()
else: # 'all'
null_locations = DataFrame(isnull_block).all().to_pandas()
keep_columns = [
col
for col, to_drop in zip(self._block.value_columns, null_locations)
if not to_drop
]
if thresh is not None:
# Keep columns with at least 'thresh' non-null values
notnull_block = self._block.multi_apply_unary_op(ops.notnull_op)
notnull_counts = DataFrame(notnull_block).sum().to_pandas()

keep_columns = [
col
for col, count in zip(self._block.value_columns, notnull_counts)
if count >= thresh
]
else:
isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
if how == "any":
null_locations = DataFrame(isnull_block).any().to_pandas()
else: # 'all'
null_locations = DataFrame(isnull_block).all().to_pandas()
keep_columns = [
col
for col, to_drop in zip(self._block.value_columns, null_locations)
if not to_drop
]
return DataFrame(self._block.select_columns(keep_columns))

def any(
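The `axis=1` branch computes the per-column non-null counts client-side from a `notnull` block and then re-selects the surviving columns. A rough pandas analogue of that selection, with hypothetical data (not the BigFrames execution path):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "a": [1, 2, 3],           # 3 non-null values
        "b": [None, 5, None],     # 1 non-null value
        "c": [None, None, None],  # 0 non-null values
    }
)

thresh = 2
notnull_counts = df.notnull().sum()  # one count per column
keep_columns = [col for col, n in notnull_counts.items() if n >= thresh]
print(df[keep_columns])  # only column "a" reaches the threshold
```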
32 changes: 31 additions & 1 deletion tests/system/small/test_dataframe.py
@@ -1207,7 +1207,7 @@ def test_assign_callable_lambda(scalars_dfs):
(1, "all", False, None),
],
)
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
scalars_df, scalars_pandas_df = scalars_dfs
@@ -1222,6 +1222,36 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
pandas.testing.assert_frame_equal(bf_result, pd_result)


@pytest.mark.parametrize(
("axis", "ignore_index", "subset", "thresh"),
[
(0, False, None, 2),
(0, True, None, 3),
(1, False, None, 2),
],
)
def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh):
"""
Tests that dropna correctly keeps rows/columns with a minimum number
of non-null values.
"""
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
scalars_df, scalars_pandas_df = scalars_dfs

df_result = scalars_df.dropna(
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
)
pd_result = scalars_pandas_df.dropna(
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
)

bf_result = df_result.to_pandas()
# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
pd.testing.assert_frame_equal(bf_result, pd_result)


def test_df_dropna_range_columns(scalars_dfs):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
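The index cast in the new test is needed because BigQuery DataFrames materializes integer indexes with the nullable `Int64` dtype, while a local pandas `dropna` keeps a plain `int64` index; the two have to be aligned before `assert_frame_equal`. A standalone illustration of that dtype difference (toy index values):

```python
import pandas as pd

pd_index = pd.Index([1, 2])                       # what local pandas produces
bf_like_index = pd_index.astype(pd.Int64Dtype())  # what BigFrames results use
print(pd_index.dtype, bf_like_index.dtype)        # int64 Int64
```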
26 changes: 25 additions & 1 deletion third_party/bigframes_vendored/pandas/core/frame.py
@@ -1762,6 +1762,7 @@ def dropna(
*,
axis: int | str = 0,
how: str = "any",
thresh: Optional[int] = None,
subset=None,
inplace: bool = False,
ignore_index=False,
@@ -1812,6 +1813,25 @@
<BLANKLINE>
[3 rows x 3 columns]

Keep rows with at least 2 non-null values.

>>> df.dropna(thresh=2)
name toy born
1 Batman Batmobile 1940-04-25
2 Catwoman Bullwhip <NA>
<BLANKLINE>
[2 rows x 3 columns]

Keep columns with at least 2 non-null values:

>>> df.dropna(axis='columns', thresh=2)
name toy
0 Alfred <NA>
1 Batman Batmobile
2 Catwoman Bullwhip
<BLANKLINE>
[3 rows x 2 columns]

Define in which columns to look for missing values.

>>> df.dropna(subset=['name', 'toy'])
@@ -1822,7 +1842,7 @@
[2 rows x 3 columns]

Args:
axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
axis ({0 or 'index', 1 or 'columns'}, default 0):
Determine if rows or columns which contain missing values are
removed.

@@ -1834,6 +1854,8 @@

* 'any' : If any NA values are present, drop that row or column.
* 'all' : If all values are NA, drop that row or column.
thresh (int, optional):
Require that many non-NA values. Cannot be combined with how.
subset (column label or sequence of labels, optional):
Labels along other axis to consider, e.g. if you are dropping
rows these would be a list of columns to include.
@@ -1851,6 +1873,8 @@
Raises:
ValueError:
If ``how`` is not one of ``any`` or ``all``.
            TypeError:
If both ``how`` and ``thresh`` are specified.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

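The documented `TypeError` mirrors the behavior of recent pandas releases (2.x) when `how` and `thresh` are both passed explicitly; a quick check against plain pandas, noting that the exact message may differ from the BigFrames implementation:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, None], "b": [None, None]})
try:
    df.dropna(how="all", thresh=1)
except TypeError as err:
    print(err)  # cannot set both the how and thresh arguments at the same time
```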