From 3fb58878a3f0cecd5e2e4cc8ec6c04276eb80dd8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 12 Sep 2024 21:20:15 +0000 Subject: [PATCH 1/7] feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider --- bigframes/core/block_transforms.py | 12 +++++++++-- bigframes/dataframe.py | 20 ++++++++++++++++-- tests/system/small/test_dataframe.py | 21 ++++++++++++------- tests/unit/test_dataframe.py | 9 ++++++++ .../bigframes_vendored/pandas/core/frame.py | 17 +++++++++++++++ 5 files changed, 67 insertions(+), 12 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index eaee2e2cc0..0b24a42e98 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,7 +15,7 @@ import functools import typing -from typing import Sequence +from typing import Optional, Sequence import pandas as pd @@ -488,11 +488,19 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", + subset: Optional[typing.Sequence[str]] = None, ): """ Drop na entries from block """ - predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids] + if subset is None: + subset = column_ids + + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] if len(predicates) == 0: return block if how == "any": diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2ae6aefe1b..9fc8179055 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2020,8 +2020,9 @@ def dropna( self, *, axis: int | str = 0, - inplace: bool = False, how: str = "any", + subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: if inplace: @@ -2033,8 +2034,23 @@ def dropna( axis_n = utils.get_axis_number(axis) + if subset is not None and axis_n != 0: + raise NotImplementedError( + f"subset only supported when axis=0. {constants.FEEDBACK_LINK}" + ) + if axis_n == 0: - result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore + subset = subset if utils.is_list_like(subset) else [subset] # type:ignore + + # subset needs to be converted into column IDs, not column labels. + if subset is None: + subset_ids = None + else: + subset_ids = [ + id for label in subset for id in self._block.label_to_col_id[label] + ] + + result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f51b597650..127d778531 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize( - ("axis", "how", "ignore_index"), + ("axis", "how", "ignore_index", "subset"), [ - (0, "any", False), - (0, "any", True), - (1, "any", False), - (1, "all", False), + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index): +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 6370d1b987..560c0cf0f4 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -20,6 +20,15 @@ from . import resources +def test_dataframe_dropna_axis_1_subset_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="subset"): + dataframe.dropna(axis=1, subset=["col1", "col2"]) + + def test_dataframe_repr_with_uninitialized_object(): """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index fe1c8a12ff..50df9fb44a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1614,6 +1614,8 @@ def dropna( *, axis: int | str = 0, how: str = "any", + subset=None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: """Remove missing values. @@ -1662,6 +1664,15 @@ def dropna( [3 rows x 3 columns] + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'toy']) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + [2 rows x 3 columns] + Args: axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Determine if rows or columns which contain missing values are @@ -1675,6 +1686,12 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. + subset (column label or sequence of labels, optional): + Labels along other axis to consider, e.g. if you are dropping + rows these would be a list of columns to include. + Only supports axis=0. + inplace (bool, default ``False``): + Not supported. ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. From bf8286256165f48ea69a5b207da5b1b79097e9f7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 13 Sep 2024 18:37:58 +0000 Subject: [PATCH 2/7] fix dropna with subset=None --- bigframes/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9fc8179055..8cf0847ace 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2040,11 +2040,11 @@ def dropna( ) if axis_n == 0: - subset = subset if utils.is_list_like(subset) else [subset] # type:ignore - # subset needs to be converted into column IDs, not column labels. if subset is None: subset_ids = None + elif not utils.is_list_like(subset): + subset_ids = [self._block.label_to_col_id[subset]] else: subset_ids = [ id for label in subset for id in self._block.label_to_col_id[label] From 197074abc53c5a85c66ec1940e54ea3f2c9c3677 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 13 Sep 2024 19:06:15 +0000 Subject: [PATCH 3/7] refactor: remove circular dependencies preventing local doctest runs With this change I can once again run ``` pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py ``` Note: having multiple `version.py` files should be fine. release-please will update all such files it finds. --- bigframes/bigquery/__init__.py | 3 ++- bigframes/constants.py | 8 -------- bigframes/core/block_transforms.py | 5 +++-- bigframes/core/blocks.py | 2 +- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/core/compile/ibis_types.py | 2 +- bigframes/core/compile/scalar_op_compiler.py | 2 +- bigframes/core/groupby/__init__.py | 2 +- bigframes/core/indexers.py | 2 +- bigframes/core/indexes/base.py | 2 +- bigframes/core/reshape/__init__.py | 2 +- bigframes/core/tools/datetimes.py | 2 +- bigframes/core/validations.py | 7 ++++--- bigframes/dataframe.py | 4 ++-- bigframes/dtypes.py | 3 +-- bigframes/formatting_helpers.py | 3 +-- bigframes/functions/_remote_function_client.py | 2 +- bigframes/functions/_remote_function_session.py | 3 ++- bigframes/functions/remote_function.py | 2 +- bigframes/ml/compose.py | 2 +- bigframes/ml/linear_model.py | 2 +- bigframes/ml/llm.py | 3 ++- bigframes/ml/loader.py | 2 +- bigframes/ml/metrics/_metrics.py | 2 +- bigframes/ml/pipeline.py | 2 +- bigframes/ml/sql.py | 3 +-- bigframes/ml/utils.py | 2 +- bigframes/operations/_matplotlib/core.py | 2 +- bigframes/operations/_matplotlib/hist.py | 2 +- bigframes/operations/base.py | 2 +- bigframes/operations/plotting.py | 2 +- bigframes/operations/strings.py | 2 +- bigframes/pandas/__init__.py | 2 +- bigframes/series.py | 2 +- bigframes/session/__init__.py | 2 +- bigframes/session/_io/bigquery/read_gbq_table.py | 4 ++-- bigframes/session/_io/pandas.py | 4 ++-- bigframes/session/loader.py | 6 ++++-- tests/unit/test_constants.py | 9 ++++++--- tests/unit/test_formatting_helpers.py | 5 +++-- third_party/bigframes_vendored/constants.py | 6 ++---- third_party/bigframes_vendored/version.py | 15 +++++++++++++++ 42 files changed, 77 insertions(+), 64 deletions(-) create mode 100644 third_party/bigframes_vendored/version.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 1e8e8d578d..303120b88a 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -23,7 +23,8 @@ import typing from typing import Literal, Optional, Union -import bigframes.constants as constants +import bigframes_vendored.constants as constants + import bigframes.core.groupby as groupby import bigframes.core.sql import bigframes.ml.utils as utils diff --git a/bigframes/constants.py b/bigframes/constants.py index d6fe699713..4d5b6b8eb3 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -19,14 +19,6 @@ import datetime -import bigframes_vendored.constants - -BF_VERSION = bigframes_vendored.constants.BF_VERSION -FEEDBACK_LINK = bigframes_vendored.constants.FEEDBACK_LINK -ABSTRACT_METHOD_ERROR_MESSAGE = ( - bigframes_vendored.constants.ABSTRACT_METHOD_ERROR_MESSAGE -) - DEFAULT_EXPIRATION = datetime.timedelta(days=7) # https://cloud.google.com/bigquery/docs/locations diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index eaee2e2cc0..eaac0dc785 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -17,9 +17,10 @@ import typing from typing import Sequence +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants +import bigframes.constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.expression as ex @@ -117,7 +118,7 @@ def quantile( ) quantile_cols = [] labels = [] - if len(columns) * len(qs) > constants.MAX_COLUMNS: + if len(columns) * len(qs) > bigframes.constants.MAX_COLUMNS: raise NotImplementedError("Too many aggregates requested.") for col in columns: for q in qs: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index feeed78885..3e97b1cb5e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -32,13 +32,13 @@ from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings +import bigframes_vendored.constants as constants import google.cloud.bigquery as bigquery import pandas as pd import pyarrow as pa import bigframes._config.sampling_options as sampling_options import bigframes.constants -import bigframes.constants as constants import bigframes.core as core import bigframes.core.compile.googlesql as googlesql import bigframes.core.expression as ex diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 58973b10eb..488acd63db 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -15,13 +15,13 @@ import typing from typing import cast, Optional +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas as pd -import bigframes.constants as constants import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f4ec295d5f..77bfb84425 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -17,6 +17,7 @@ from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union import warnings +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore @@ -29,7 +30,6 @@ import pandas as pd import pyarrow as pa -import bigframes.constants as constants import bigframes.dtypes # Type hints for Ibis data types supported by BigQuery DataFrame diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 9e18b391d6..c0f12865d6 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,6 +17,7 @@ import functools import typing +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.common.exceptions @@ -26,7 +27,6 @@ import numpy as np import pandas as pd -import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.core.expression as ex import bigframes.dtypes diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2b80d0389e..fb782b780b 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -17,10 +17,10 @@ import typing from typing import Sequence, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd -import bigframes.constants as constants from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.block_transforms as block_ops diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index dae5eada70..06d9c4bbab 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -17,10 +17,10 @@ import typing from typing import Tuple, Union +import bigframes_vendored.constants as constants import ibis import pandas as pd -import bigframes.constants as constants import bigframes.core.blocks import bigframes.core.expression as ex import bigframes.core.guid as guid diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0376e37f96..017702b85a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -19,12 +19,12 @@ import typing from typing import Hashable, Optional, Sequence, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index import google.cloud.bigquery as bigquery import numpy as np import pandas -import bigframes.constants as constants import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index a23461bdb9..49ecedcc87 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -16,9 +16,9 @@ import typing from typing import Iterable, Literal, Optional, Union +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 5d8d8c9685..2abb86a2f3 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -16,10 +16,10 @@ from datetime import datetime from typing import Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes import pandas as pd -import bigframes.constants as constants import bigframes.dataframe import bigframes.dtypes import bigframes.operations as ops diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index ca65445dab..701752c9fc 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -19,7 +19,8 @@ import functools from typing import Optional, Protocol, TYPE_CHECKING, Union -import bigframes.constants +import bigframes_vendored.constants as constants + import bigframes.exceptions if TYPE_CHECKING: @@ -72,9 +73,9 @@ def enforce_ordered( if not session._allows_ambiguity: suggestion_substr = suggestion + " " if suggestion else "" raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{constants.FEEDBACK_LINK}" ) if not object._block.explicitly_ordered: raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} requires an ordering. Use .sort_values or .sort_index to provide an ordering. {bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} requires an ordering. Use .sort_values or .sort_index to provide an ordering. {constants.FEEDBACK_LINK}" ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2ae6aefe1b..d7bdd5529b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -36,6 +36,7 @@ ) import warnings +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.frame as vendored_pandas_frame import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import google.api_core.exceptions @@ -49,7 +50,6 @@ import bigframes import bigframes._config.display_options as display_options import bigframes.constants -import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops @@ -3106,7 +3106,7 @@ def to_gbq( self._session.bqclient, temp_table_ref, datetime.datetime.now(datetime.timezone.utc) - + constants.DEFAULT_EXPIRATION, + + bigframes.constants.DEFAULT_EXPIRATION, ) if len(labels) != 0: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index bfed783e1e..3cd2507231 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -20,14 +20,13 @@ import typing from typing import Dict, Literal, Union +import bigframes_vendored.constants as constants import geopandas as gpd # type: ignore import google.cloud.bigquery import numpy as np import pandas as pd import pyarrow as pa -import bigframes.constants as constants - # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ pd.BooleanDtype, diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 752aeb7a10..de0ae8cc68 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -19,6 +19,7 @@ import random from typing import Any, Optional, Union +import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import humanize @@ -26,8 +27,6 @@ import IPython.display as display import ipywidgets as widgets -import bigframes.constants as constants - GenericJob = Union[ bigquery.LoadJob, bigquery.ExtractJob, bigquery.QueryJob, bigquery.CopyJob ] diff --git a/bigframes/functions/_remote_function_client.py b/bigframes/functions/_remote_function_client.py index 3698bda28b..75385f11a5 100644 --- a/bigframes/functions/_remote_function_client.py +++ b/bigframes/functions/_remote_function_client.py @@ -25,9 +25,9 @@ import tempfile from typing import cast, Tuple, TYPE_CHECKING +from bigframes_vendored import constants import requests -from bigframes import constants import bigframes.functions.remote_function_template if TYPE_CHECKING: diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py index c947fcdc63..0510980178 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_remote_function_session.py @@ -22,6 +22,7 @@ from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union import warnings +import bigframes_vendored.constants as constants import cloudpickle import google.api_core.exceptions from google.cloud import ( @@ -31,7 +32,7 @@ resourcemanager_v3, ) -from bigframes import clients, constants +from bigframes import clients if TYPE_CHECKING: from bigframes.session import Session diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b4c74e90d6..4dc6c1ad6b 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -24,12 +24,12 @@ if TYPE_CHECKING: from bigframes.session import Session +import bigframes_vendored.constants as constants import google.api_core.exceptions import google.api_core.retry from google.cloud import bigquery import google.iam.v1 -import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.remote_function_template diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 4ea63d2e81..3cfa1851f5 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -23,10 +23,10 @@ import typing from typing import cast, Iterable, List, Optional, Set, Tuple, Union +from bigframes_vendored import constants import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, impute, preprocessing, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 0816ef9b24..8fe1d6ec27 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -19,12 +19,12 @@ from typing import Dict, List, Literal, Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.linear_model._base import bigframes_vendored.sklearn.linear_model._logistic from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index a3cd065a55..53a9d40c6e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -19,10 +19,11 @@ from typing import cast, Literal, Optional, Union import warnings +import bigframes_vendored.constants as constants from google.cloud import bigquery import bigframes -from bigframes import clients, constants +from bigframes import clients from bigframes.core import blocks, log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 4e7e808260..de9681660e 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -17,10 +17,10 @@ from types import MappingProxyType from typing import Union +import bigframes_vendored.constants as constants from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.ml import ( cluster, compose, diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index a40c175000..3c2d6514ae 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -19,6 +19,7 @@ import typing from typing import Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression @@ -26,7 +27,6 @@ import pandas as pd import sklearn.metrics as sklearn_metrics # type: ignore -import bigframes.constants as constants from bigframes.ml import utils import bigframes.pandas as bpd diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 4cd60c5836..dc3bd1f3f4 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -20,11 +20,11 @@ from typing import List, Optional, Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.pipeline from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.core import log_adapter from bigframes.ml import ( base, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index d14627f590..7120a5a5fd 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -18,10 +18,9 @@ from typing import Iterable, Literal, Mapping, Optional, Union +import bigframes_vendored.constants as constants import google.cloud.bigquery -import bigframes.constants as constants - # TODO: Add proper escaping logic from core/compile module class BaseSqlGenerator: diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 75dfb916f6..d754b1d002 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -15,9 +15,9 @@ import typing from typing import Any, Iterable, Literal, Mapping, Optional, Union +import bigframes_vendored.constants as constants from google.cloud import bigquery -import bigframes.constants as constants from bigframes.core import blocks import bigframes.pandas as bpd diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index ff8dd86cff..9e59e09877 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -15,9 +15,9 @@ import abc import typing +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants import bigframes.dtypes as dtypes DEFAULT_SAMPLING_N = 1000 diff --git a/bigframes/operations/_matplotlib/hist.py b/bigframes/operations/_matplotlib/hist.py index 720b94d7da..213e2abd77 100644 --- a/bigframes/operations/_matplotlib/hist.py +++ b/bigframes/operations/_matplotlib/hist.py @@ -15,10 +15,10 @@ import itertools from typing import Literal +import bigframes_vendored.constants as constants import numpy as np import pandas as pd -import bigframes.constants as constants import bigframes.operations._matplotlib.core as bfplt diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 2f87045415..68f46baded 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -17,10 +17,10 @@ import typing from typing import List, Sequence +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import pandas as pd -import bigframes.constants as constants import bigframes.core.blocks as blocks import bigframes.core.convert import bigframes.core.expression as ex diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index ff74806993..a45b825354 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -14,9 +14,9 @@ import typing +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.plotting._core as vendordt -import bigframes.constants as constants import bigframes.operations._matplotlib as bfplt diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4af142e0d5..2e40115985 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -17,9 +17,9 @@ import re from typing import cast, Literal, Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr -import bigframes.constants as constants from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 3809384c95..94ea6becab 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -36,6 +36,7 @@ Union, ) +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat import bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge @@ -53,7 +54,6 @@ ) import bigframes._config as config -import bigframes.constants as constants import bigframes.core.blocks import bigframes.core.expression as ex import bigframes.core.global_session as global_session diff --git a/bigframes/series.py b/bigframes/series.py index d9e3bb19dd..d4cb1d3700 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -24,6 +24,7 @@ import typing from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery import numpy @@ -31,7 +32,6 @@ import pandas.core.dtypes.common import typing_extensions -import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 045483bd53..7d0cfaee5c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -37,6 +37,7 @@ import warnings import weakref +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery # noqa import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet @@ -58,7 +59,6 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 03b26f9460..7585dd3f45 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -23,12 +23,12 @@ from typing import Dict, Iterable, List, Optional, Sequence, Tuple import warnings +import bigframes_vendored.constants as constants import google.api_core.exceptions import google.cloud.bigquery as bigquery import bigframes import bigframes.clients -import bigframes.constants import bigframes.core.compile import bigframes.core.compile.default_ordering import bigframes.core.sql @@ -241,7 +241,7 @@ def get_index_cols( # test, as it's not possible to subclass enums in Python. See: # https://stackoverflow.com/a/33680021/101923 raise NotImplementedError( - f"Got unexpected index_col {repr(index_col)}. {bigframes.constants.FEEDBACK_LINK}" + f"Got unexpected index_col {repr(index_col)}. {constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): index_cols: List[str] = [index_col] diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 789426a6e3..83e30fd900 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -14,6 +14,7 @@ from typing import Dict, Union +import bigframes_vendored.constants as constants import geopandas # type: ignore import pandas import pandas.arrays @@ -21,7 +22,6 @@ import pyarrow.compute # type: ignore import pyarrow.types # type: ignore -import bigframes.constants import bigframes.features @@ -54,7 +54,7 @@ def arrow_to_pandas( if len(dtypes) != arrow_table.num_columns: raise ValueError( f"Number of types {len(dtypes)} doesn't match number of columns " - f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + f"{arrow_table.num_columns}. {constants.FEEDBACK_LINK}" ) serieses = {} diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 924fddce12..ce9874e35f 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -22,6 +22,7 @@ import typing from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.auth.credentials @@ -36,7 +37,7 @@ import pandas import bigframes.clients -import bigframes.constants as constants +import bigframes.constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile @@ -444,7 +445,8 @@ def _read_bigquery_load_job( # hours of the anonymous dataset. table_expiration = bigquery.Table(table_id) table_expiration.expires = ( - datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + datetime.datetime.now(datetime.timezone.utc) + + bigframes.constants.DEFAULT_EXPIRATION ) self._bqclient.update_table(table_expiration, ["expires"]) diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py index aabc09c388..4e11419077 100644 --- a/tests/unit/test_constants.py +++ b/tests/unit/test_constants.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bigframes.constants as constants +import bigframes_vendored.constants + +import bigframes.version def test_feedback_link_includes_version(): - assert len(constants.BF_VERSION) > 0 - assert constants.BF_VERSION in constants.FEEDBACK_LINK + version = bigframes.version.__version__ + assert len(version) > 0 + assert version in bigframes_vendored.constants.FEEDBACK_LINK diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index 3c966752c9..d4fe039484 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -14,12 +14,13 @@ import unittest.mock as mock +import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import pytest -import bigframes.constants as constants import bigframes.formatting_helpers as formatting_helpers +import bigframes.version def test_wait_for_query_job_error_includes_feedback_link(): @@ -54,4 +55,4 @@ def test_wait_for_job_error_includes_version(): formatting_helpers.wait_for_job(mock_job) cap_exc.match("Test message 123.") - cap_exc.match(constants.BF_VERSION) + cap_exc.match(bigframes.version.__version__) diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index 91084b38f9..1effdffcbe 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -16,14 +16,12 @@ This module should not depend on any others in the package. """ -import bigframes.version - -BF_VERSION = bigframes.version.__version__ +import bigframes_vendored.version FEEDBACK_LINK = ( "Share your usecase with the BigQuery DataFrames team at the " "https://bit.ly/bigframes-feedback survey." - f"You are currently running BigFrames version {BF_VERSION}" + f"You are currently running BigFrames version {bigframes_vendored.version.__version__}" ) ABSTRACT_METHOD_ERROR_MESSAGE = ( diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py new file mode 100644 index 0000000000..2c0c6e4d3a --- /dev/null +++ b/third_party/bigframes_vendored/version.py @@ -0,0 +1,15 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "1.17.0" From 6f8f128db72da6a3b28e1254728698f70ecdeb24 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 13 Sep 2024 19:49:23 +0000 Subject: [PATCH 4/7] fix doctest --- third_party/bigframes_vendored/pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 50df9fb44a..70e1b0d04d 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1667,9 +1667,9 @@ def dropna( Define in which columns to look for missing values. >>> df.dropna(subset=['name', 'toy']) - name toy born - 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip [2 rows x 3 columns] From 0f18294513a2a6a806f5fff4ff38646d4029a884 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 13 Sep 2024 19:51:35 +0000 Subject: [PATCH 5/7] Revert "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 57e8335af83558323d4df213854efc1edcdb35f0, reversing changes made to 197074abc53c5a85c66ec1940e54ea3f2c9c3677. --- bigframes/core/block_transforms.py | 12 ++--------- bigframes/dataframe.py | 20 ++---------------- tests/system/small/test_dataframe.py | 21 +++++++------------ tests/unit/test_dataframe.py | 9 -------- .../bigframes_vendored/pandas/core/frame.py | 8 ------- 5 files changed, 12 insertions(+), 58 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index bf35231154..eaac0dc785 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,7 +15,7 @@ import functools import typing -from typing import Optional, Sequence +from typing import Sequence import bigframes_vendored.constants as constants import pandas as pd @@ -489,19 +489,11 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", - subset: Optional[typing.Sequence[str]] = None, ): """ Drop na entries from block """ - if subset is None: - subset = column_ids - - predicates = [ - ops.notnull_op.as_expr(column_id) - for column_id in column_ids - if column_id in subset - ] + predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids] if len(predicates) == 0: return block if how == "any": diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 43265cdd7e..d7bdd5529b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2020,9 +2020,8 @@ def dropna( self, *, axis: int | str = 0, - how: str = "any", - subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, inplace: bool = False, + how: str = "any", ignore_index=False, ) -> DataFrame: if inplace: @@ -2034,23 +2033,8 @@ def dropna( axis_n = utils.get_axis_number(axis) - if subset is not None and axis_n != 0: - raise NotImplementedError( - f"subset only supported when axis=0. {constants.FEEDBACK_LINK}" - ) - if axis_n == 0: - # subset needs to be converted into column IDs, not column labels. - if subset is None: - subset_ids = None - elif not utils.is_list_like(subset): - subset_ids = [self._block.label_to_col_id[subset]] - else: - subset_ids = [ - id for label in subset for id in self._block.label_to_col_id[label] - ] - - result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore + result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 127d778531..f51b597650 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -936,24 +936,19 @@ def test_assign_callable_lambda(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize( - ("axis", "how", "ignore_index", "subset"), + ("axis", "how", "ignore_index"), [ - (0, "any", False, None), - (0, "any", True, None), - (0, "all", False, ["bool_col", "time_col"]), - (0, "any", False, ["bool_col", "time_col"]), - (0, "all", False, "time_col"), - (1, "any", False, None), - (1, "all", False, None), + (0, "any", False), + (0, "any", True), + (1, "any", False), + (1, "all", False), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): +def test_df_dropna(scalars_dfs, axis, how, ignore_index): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) + pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 560c0cf0f4..6370d1b987 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -20,15 +20,6 @@ from . import resources -def test_dataframe_dropna_axis_1_subset_not_implememented( - monkeypatch: pytest.MonkeyPatch, -): - dataframe = resources.create_dataframe(monkeypatch) - - with pytest.raises(NotImplementedError, match="subset"): - dataframe.dropna(axis=1, subset=["col1", "col2"]) - - def test_dataframe_repr_with_uninitialized_object(): """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 70e1b0d04d..a3719d91f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1614,8 +1614,6 @@ def dropna( *, axis: int | str = 0, how: str = "any", - subset=None, - inplace: bool = False, ignore_index=False, ) -> DataFrame: """Remove missing values. @@ -1686,12 +1684,6 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. - subset (column label or sequence of labels, optional): - Labels along other axis to consider, e.g. if you are dropping - rows these would be a list of columns to include. - Only supports axis=0. - inplace (bool, default ``False``): - Not supported. ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. From 80a827edcffb4e7a902f70f0c6c9e8afc45b2198 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 13 Sep 2024 19:52:49 +0000 Subject: [PATCH 6/7] Reapply "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 0f18294513a2a6a806f5fff4ff38646d4029a884. --- bigframes/core/block_transforms.py | 12 +++++++++-- bigframes/dataframe.py | 20 ++++++++++++++++-- tests/system/small/test_dataframe.py | 21 ++++++++++++------- tests/unit/test_dataframe.py | 9 ++++++++ .../bigframes_vendored/pandas/core/frame.py | 8 +++++++ 5 files changed, 58 insertions(+), 12 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index eaac0dc785..bf35231154 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,7 +15,7 @@ import functools import typing -from typing import Sequence +from typing import Optional, Sequence import bigframes_vendored.constants as constants import pandas as pd @@ -489,11 +489,19 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", + subset: Optional[typing.Sequence[str]] = None, ): """ Drop na entries from block """ - predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids] + if subset is None: + subset = column_ids + + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] if len(predicates) == 0: return block if how == "any": diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d7bdd5529b..43265cdd7e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2020,8 +2020,9 @@ def dropna( self, *, axis: int | str = 0, - inplace: bool = False, how: str = "any", + subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: if inplace: @@ -2033,8 +2034,23 @@ def dropna( axis_n = utils.get_axis_number(axis) + if subset is not None and axis_n != 0: + raise NotImplementedError( + f"subset only supported when axis=0. {constants.FEEDBACK_LINK}" + ) + if axis_n == 0: - result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore + # subset needs to be converted into column IDs, not column labels. + if subset is None: + subset_ids = None + elif not utils.is_list_like(subset): + subset_ids = [self._block.label_to_col_id[subset]] + else: + subset_ids = [ + id for label in subset for id in self._block.label_to_col_id[label] + ] + + result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f51b597650..127d778531 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize( - ("axis", "how", "ignore_index"), + ("axis", "how", "ignore_index", "subset"), [ - (0, "any", False), - (0, "any", True), - (1, "any", False), - (1, "all", False), + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index): +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 6370d1b987..560c0cf0f4 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -20,6 +20,15 @@ from . import resources +def test_dataframe_dropna_axis_1_subset_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="subset"): + dataframe.dropna(axis=1, subset=["col1", "col2"]) + + def test_dataframe_repr_with_uninitialized_object(): """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index a3719d91f6..70e1b0d04d 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1614,6 +1614,8 @@ def dropna( *, axis: int | str = 0, how: str = "any", + subset=None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: """Remove missing values. @@ -1684,6 +1686,12 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. + subset (column label or sequence of labels, optional): + Labels along other axis to consider, e.g. if you are dropping + rows these would be a list of columns to include. + Only supports axis=0. + inplace (bool, default ``False``): + Not supported. ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. From a0d7c8c4d4a7941e6fd1f2c141aa90f3a91bfc46 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 16 Sep 2024 17:31:59 +0000 Subject: [PATCH 7/7] loop over tuple result --- bigframes/dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 43265cdd7e..797cf54f2d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2044,10 +2044,12 @@ def dropna( if subset is None: subset_ids = None elif not utils.is_list_like(subset): - subset_ids = [self._block.label_to_col_id[subset]] + subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]] else: subset_ids = [ - id for label in subset for id in self._block.label_to_col_id[label] + id_ + for label in subset + for id_ in self._block.label_to_col_id[label] ] result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore