fix: Local data always has sequential index #1514

Merged
merged 6 commits on Mar 26, 2025
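
In short: a Series built from local data (lists, scalars, local pandas objects) now always gets a sequential default index, even when the session runs in unordered mode. A minimal sketch of the guaranteed behaviour, adapted from `test_series_construct_local_unordered_has_sequential_index` below (assumes a configured BigQuery session via `bigframes.pandas`):

```python
import pandas as pd
import bigframes.pandas as bpd

# Local list data always receives a sequential default index
# (0, 1, 2, ...), regardless of the session's ordering mode.
s = bpd.Series(["Sun", "Mon", "Tues"])
expected = pd.Index([0, 1, 2], dtype=pd.Int64Dtype())
pd.testing.assert_index_equal(s.index.to_pandas(), expected)
```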
9 changes: 5 additions & 4 deletions bigframes/core/indexes/base.py
@@ -70,9 +70,7 @@ def __new__(
elif isinstance(data, series.Series) or isinstance(data, Index):
if isinstance(data, series.Series):
block = data._block
block = block.set_index(
col_ids=[data._value_column],
)
block = block.set_index(col_ids=[data._value_column])
elif isinstance(data, Index):
block = data._block
index = Index(data=block)
@@ -508,7 +506,10 @@ def to_pandas(
...

def to_pandas(
self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False
self,
*,
allow_large_results: Optional[bool] = None,
dry_run: bool = False,
) -> pandas.Index | pandas.Series:
"""Gets the Index as a pandas Index.

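
The change above only re-wraps the `Index.to_pandas` signature. For context, a hedged usage sketch of the two keyword-only parameters; the shape of the dry-run result is an assumption based on the `pandas.Index | pandas.Series` return annotation:

```python
import bigframes.pandas as bpd

idx = bpd.Index([1, 2, 3])

# Materialize the index locally as a pandas.Index.
pd_idx = idx.to_pandas(allow_large_results=True)

# With dry_run=True the annotated return type is pandas.Series,
# presumably carrying dry-run job statistics rather than index values.
dry_run_stats = idx.to_pandas(dry_run=True)
```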
84 changes: 39 additions & 45 deletions bigframes/operations/base.py
@@ -27,6 +27,7 @@
import bigframes.core.identifiers as ids
import bigframes.core.indexes as indexes
import bigframes.core.scalar as scalars
import bigframes.core.utils as bf_utils
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
@@ -69,40 +70,21 @@ def __init__(
raise ValueError(
f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
)

if isinstance(data, blocks.Block):
# Constructing from block is for internal use only - shouldn't use parameters, block encompasses all state
assert len(data.value_columns) == 1
assert len(data.column_labels) == 1
assert index is None
assert name is None
assert dtype is None
block = data

# interpret these cases as both index and data
elif isinstance(data, bigframes.pandas.Series) or pd.api.types.is_dict_like(
data
): # includes pd.Series
if isinstance(data, bigframes.pandas.Series):
data = data.copy()
if name is not None:
data.name = name
if dtype is not None:
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
data = data.astype(bf_dtype)
else: # local dict-like data
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
data_block = data._block
if index is not None:
# reindex
bf_index = indexes.Index(index, session=session)
idx_block = bf_index._block
idx_cols = idx_block.value_columns
block_idx, _ = idx_block.join(data_block, how="left")
data_block = block_idx.with_index_labels(bf_index.names)
block = data_block

# list-like data that will get default index
elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
elif isinstance(data, SeriesMethods):
block = data._get_block()
# special case where data is local scalar, but index is bigframes index (maybe very big)
elif (
not bf_utils.is_list_like(data) and not isinstance(data, indexes.Index)
) and isinstance(index, indexes.Index):
block = index._block
block, _ = block.create_constant(data)
block = block.with_column_labels([None])
# prevents no-op reindex later
index = None
elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
data = indexes.Index(data, dtype=dtype, name=name, session=session)
# set to none as it has already been applied, avoid re-cast later
if data.nlevels != 1:
@@ -111,8 +93,7 @@ def __init__(
data_block = data._block.reset_index(drop=False).with_column_labels(
data.names
)
if index is not None:
# Align by offset
if index is not None: # Align data and index by offset
bf_index = indexes.Index(index, session=session)
idx_block = bf_index._block.reset_index(
drop=False
@@ -121,19 +102,32 @@ def __init__(
data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
data_block = data_block.with_index_labels(bf_index.names)
# prevents no-op reindex later
index = None
block = data_block

else: # Scalar case
if index is not None:
bf_index = indexes.Index(index, session=session)
else:
bf_index = indexes.Index(
[] if (data is None) else [0],
session=session,
dtype=bigframes.dtypes.INT_DTYPE,
)
block, _ = bf_index._block.create_constant(data, dtype)
block = block.with_column_labels([name])
if block:
assert len(block.value_columns) == 1
assert len(block.column_labels) == 1
if index is not None: # reindexing operation
bf_index = indexes.Index(index)
idx_block = bf_index._block
idx_cols = idx_block.index_columns
block, _ = idx_block.join(block, how="left")
block = block.with_index_labels(bf_index.names)
if name:
block = block.with_column_labels([name])
if dtype:
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
else:
pd_series = pd.Series(
data=data,
index=index, # type:ignore
dtype=dtype, # type:ignore
name=name,
)
block = read_pandas_func(pd_series)._get_block() # type:ignore

assert block is not None
self._block: blocks.Block = block
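
The reworked constructor adds a dedicated path for data that is a local scalar while the index is a (possibly very large) bigframes Index: the scalar is broadcast onto the index's block with `create_constant` instead of being round-tripped through pandas. Anything that matches none of the block/Series/Index branches now falls back to building a local `pd.Series` and loading it through `read_pandas_func`, which is what guarantees the sequential default index. A sketch of the scalar-broadcast path, mirroring `test_series_construct_scalar_w_bf_index` added below:

```python
import pandas as pd
import bigframes.pandas as bpd

# Scalar data plus a bigframes index: the scalar is broadcast across the
# index on the BigQuery side rather than materialized locally first.
s = bpd.Series("hello", index=bpd.Index([1, 2, 3]))

expected = pd.Series(
    "hello", index=pd.Index([1, 2, 3], dtype="Int64")
).astype("string[pyarrow]")
pd.testing.assert_series_equal(s.to_pandas(), expected)
```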
20 changes: 19 additions & 1 deletion tests/system/small/test_series.py
@@ -140,7 +140,6 @@ def test_series_construct_reindex():

# BigQuery DataFrame default indices use nullable Int64 always
pd_result.index = pd_result.index.astype("Int64")

pd.testing.assert_series_equal(bf_result, pd_result)


@@ -201,6 +200,17 @@ def test_series_construct_nan():
pd.testing.assert_series_equal(bf_result, pd_result)


def test_series_construct_scalar_w_bf_index():
bf_result = series.Series(
"hello", index=bigframes.pandas.Index([1, 2, 3])
).to_pandas()
pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64"))

pd_result = pd_result.astype("string[pyarrow]")

pd.testing.assert_series_equal(bf_result, pd_result)


def test_series_construct_from_list_escaped_strings():
"""Check that special characters are supported."""
strings = [
@@ -303,6 +313,14 @@ def test_series_construct_w_dtype_for_array_struct():
)


def test_series_construct_local_unordered_has_sequential_index(unordered_session):
series = bigframes.pandas.Series(
["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session
)
expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
pd.testing.assert_index_equal(series.index.to_pandas(), expected)


def test_series_construct_w_dtype_for_json():
# Until b/401630655 is resolved, json, not compatible with allow_large_results=False
with bigframes.option_context("bigquery.allow_large_results", True):