Skip to content

fix: make DataFrame __getattr__ and __setattr__ more robust to subclassing #1352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 38 additions & 19 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def __init__(
):
global bigframes

self._query_job: Optional[bigquery.QueryJob] = None

if copy is not None and not copy:
raise ValueError(
f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}"
Expand Down Expand Up @@ -182,7 +184,6 @@ def __init__(
if dtype:
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
self._block = block

else:
import bigframes.pandas
Expand All @@ -194,10 +195,14 @@ def __init__(
dtype=dtype, # type:ignore
)
if session:
self._block = session.read_pandas(pd_dataframe)._get_block()
block = session.read_pandas(pd_dataframe)._get_block()
else:
self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
self._query_job: Optional[bigquery.QueryJob] = None
block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()

# We use _block as an indicator in __getattr__ and __setattr__ to see
# if the object is fully initialized, so make sure we set the _block
# attribute last.
self._block = block
self._block.session._register_object(self)

def __dir__(self):
Expand Down Expand Up @@ -625,13 +630,17 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame:
return DataFrame(block)

def __getattr__(self, key: str):
# Protect against recursion errors with uninitialized DataFrame
# objects. See:
# To allow subclasses to set private attributes before the class is
# fully initialized, protect against recursion errors with
# uninitialized DataFrame objects. Note: this comes at the downside
# that columns with a leading `_` won't be treated as columns.
#
# See:
# https://github.com/googleapis/python-bigquery-dataframes/issues/728
# and
# https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html
if key == "_block":
raise AttributeError("_block")
raise AttributeError(key)

if key in self._block.column_labels:
return self.__getitem__(key)
Expand All @@ -651,26 +660,36 @@ def __getattr__(self, key: str):
raise AttributeError(key)

def __setattr__(self, key: str, value):
if key in ["_block", "_query_job"]:
if key == "_block":
object.__setattr__(self, key, value)
return

# To allow subclasses to set private attributes before the class is
# fully initialized, assume anything set before `_block` is initialized
# is a regular attribute.
if not hasattr(self, "_block"):
object.__setattr__(self, key, value)
return
# Can this be removed???

# If someone has a column named the same as a normal attribute
# (e.g. index), we want to set the normal attribute, not the column.
# To do that, check if there is a normal attribute by using
# __getattribute__ (not __getattr__, because that includes columns).
# If that returns a value without raising, then we know this is a
# normal attribute and we should prefer that.
try:
# boring attributes go through boring old path
object.__getattribute__(self, key)
return object.__setattr__(self, key, value)
except AttributeError:
pass

# if this fails, go on to more involved attribute setting
# (note that this matches __getattr__, above).
try:
if key in self.columns:
self[key] = value
else:
object.__setattr__(self, key, value)
# Can this be removed?
except (AttributeError, TypeError):
# If we made it here, then we know that it's not a regular attribute
# already, so it might be a column to update. Note: we don't allow
# adding new columns using __setattr__, only __setitem__, that way we
# can still add regular new attributes.
if key in self._block.column_labels:
self[key] = value
else:
object.__setattr__(self, key, value)

def __repr__(self) -> str:
Expand Down
9 changes: 9 additions & 0 deletions tests/unit/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ def test_dataframe_repr_with_uninitialized_object():
assert "DataFrame" in got


def test_dataframe_setattr_with_uninitialized_object():
"""Ensures DataFrame can be subclassed without trying to set attributes as columns."""
# Avoid calling __init__ since it might be called later in a subclass.
# https://stackoverflow.com/a/6384982/101923
dataframe = bigframes.dataframe.DataFrame.__new__(bigframes.dataframe.DataFrame)
dataframe.lineage = "my-test-value"
assert dataframe.lineage == "my-test-value" # Should just be a regular attribute.


def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch):
dataframe = resources.create_dataframe(monkeypatch)

Expand Down