From 31d0b35b36bde232c98c8a5631e516cc4d37968e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 24 Jul 2025 21:26:05 +0000 Subject: [PATCH 01/15] remove expensive len() call --- bigframes/display/anywidget.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 5a20ddcb7f..9e70293816 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -45,8 +45,10 @@ class TableWidget(WIDGET_BASE): - """ - An interactive, paginated table widget for BigFrames DataFrames. + """An interactive, paginated table widget for BigFrames DataFrames. + + This widget provides a user-friendly way to display and navigate through + large BigQuery DataFrames within a Jupyter environment. """ def __init__(self, dataframe: bigframes.dataframe.DataFrame): @@ -75,15 +77,18 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Initialize data fetching attributes. self._batches = dataframe.to_pandas_batches(page_size=initial_page_size) + # Access total_rows through type casting (internal use only) + from bigframes.core.blocks import PandasBatches + + if isinstance(self._batches, PandasBatches): + self.row_count = self._batches.total_rows or 0 + else: + # Fallback for compatibility + self.row_count = 0 + # set traitlets properties that trigger observers self.page_size = initial_page_size - # len(dataframe) is expensive, since it will trigger a - # SELECT COUNT(*) query. It is a must have however. - # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` - # before we get here so that the count might already be cached. - self.row_count = len(dataframe) - # get the initial page self._set_table_html() From f7eca6b7cd890205af429099dabdd18054502c57 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 24 Jul 2025 23:21:12 +0000 Subject: [PATCH 02/15] add testcase --- notebooks/dataframes/anywidget_mode.ipynb | 20 +++- tests/system/small/test_anywidget.py | 116 ++++++++++++++++++++-- 2 files changed, 126 insertions(+), 10 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 617329ba65..8bffb23df0 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -76,7 +76,7 @@ { "data": { "text/html": [ - "Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. Open Job" + "Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -139,10 +139,22 @@ "id": "ce250157", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635", + "model_id": "e74c3920b93644a0b2afdaa3841cad31", "version_major": 2, "version_minor": 1 }, @@ -193,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "121e3d2f28004036a922e3a11a08d4b7", + "model_id": "b4f7a3f86ef54e07b24ef10061088391", "version_major": 2, "version_minor": 1 }, @@ -287,7 +299,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5ed335bbbc064e5391ea06a9a218642e", + "model_id": "44a829aca2f24cfdba4b61afd1a259fe", "version_major": 2, "version_minor": 1 }, diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 8a91176dd9..fbb8851ef9 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing + import pandas as pd import pytest @@ -61,11 +63,12 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame): Helper fixture to create a TableWidget instance with a fixed page size. This reduces duplication across tests that use the same widget configuration. """ - from bigframes import display + + from bigframes.display import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): # Delay context manager cleanup of `max_rows` until after tests finish. - yield display.TableWidget(paginated_bf_df) + yield TableWidget(paginated_bf_df) @pytest.fixture(scope="module") @@ -90,10 +93,10 @@ def small_bf_df( @pytest.fixture def small_widget(small_bf_df): """Helper fixture for tests using a DataFrame smaller than the page size.""" - from bigframes import display + from bigframes.display import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5): - yield display.TableWidget(small_bf_df) + yield TableWidget(small_bf_df) @pytest.fixture(scope="module") @@ -135,10 +138,10 @@ def test_widget_initialization_should_calculate_total_row_count( paginated_bf_df: bf.dataframe.DataFrame, ): """A TableWidget should correctly calculate the total row count on creation.""" - from bigframes import display + from bigframes.display import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = display.TableWidget(paginated_bf_df) + widget = TableWidget(paginated_bf_df) assert widget.row_count == EXPECTED_ROW_COUNT @@ -436,6 +439,107 @@ def test_widget_creation_should_load_css_for_rendering(table_widget): assert ".bigframes-widget .footer" in css_content +def test_widget_row_count_should_be_immutable_after_creation( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Given a widget created with a specific configuration when global display + options are changed later, the widget's original row_count should remain + unchanged. + """ + from bigframes.display import TableWidget + + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + widget = TableWidget(paginated_bf_df) + initial_row_count = widget.row_count + assert initial_row_count == EXPECTED_ROW_COUNT + + # Change a global option that could influence row count + bf.options.display.max_rows = 10 + + # The widget's row count was fixed at creation and should not change. 
+ assert widget.row_count == initial_row_count + + +def test_widget_should_fallback_to_zero_rows_when_total_rows_is_none( + paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch +): + """ + Given an internal component that fails to provide a total row count, + when the widget is created, the row_count should safely fall back to 0. + """ + from bigframes.core.blocks import PandasBatches + + # Simulate an internal failure where total_rows returns None + monkeypatch.setattr(PandasBatches, "total_rows", property(lambda self: None)) + + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display import TableWidget + + widget = TableWidget(paginated_bf_df) + + assert widget.row_count == 0 + + +def test_widget_should_fallback_to_zero_rows_when_batches_are_invalid_type( + paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch +): + """ + Given an internal component that returns an unexpected data type, + when the widget is created, the row_count should safely fall back to 0. + """ + # Simulate internal method returning an unexpected type (a simple iterator) + def mock_to_pandas_batches(self, **kwargs): + return iter([paginated_bf_df.to_pandas().iloc[:2]]) + + monkeypatch.setattr( + "bigframes.dataframe.DataFrame.to_pandas_batches", mock_to_pandas_batches + ) + + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display import TableWidget + + widget = TableWidget(paginated_bf_df) + + assert widget.row_count == 0 + + +@pytest.mark.parametrize( + "max_results, expected_rows", + [ + (None, EXPECTED_ROW_COUNT), + (3, 3), + (10, EXPECTED_ROW_COUNT), + ], + ids=["no_limit", "limit_is_less_than_total", "limit_is_greater_than_total"], +) +def test_widget_row_count_should_respect_max_results_on_creation( + paginated_bf_df: bf.dataframe.DataFrame, + max_results: typing.Optional[int], + expected_rows: int, +): + """ + Given a max_results value, when a TableWidget is created with custom batches, + its row_count should be correctly capped by that value. + """ + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + from bigframes.core.blocks import PandasBatches + from bigframes.display import TableWidget + + widget = TableWidget(paginated_bf_df) + + # Override batches with max_results to test the behavior + widget._batches = paginated_bf_df.to_pandas_batches( + page_size=widget.page_size, max_results=max_results + ) + + # Re-apply thelogic to update row_count + if isinstance(widget._batches, PandasBatches): + widget.row_count = widget._batches.total_rows or 0 + + assert widget.row_count == expected_rows + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. 
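[The pattern these first two patches establish is to reuse the row count that `to_pandas_batches()` already fetched as query metadata instead of paying for a separate `SELECT COUNT(*)`. A minimal runnable sketch of that fallback logic, using a hypothetical stand-in for `bigframes.core.blocks.PandasBatches` (the real class requires a live BigQuery session, so the stand-in below only models the surface the widget relies on):

from typing import Iterator, List, Optional

import pandas as pd


class StubPandasBatches:
    # Hypothetical stand-in for bigframes.core.blocks.PandasBatches: the real
    # class pages through query results and exposes a `total_rows` value
    # populated from the finished query's metadata.
    def __init__(self, pages: List[pd.DataFrame], total_rows: Optional[int]):
        self._pages = pages
        self.total_rows = total_rows

    def __iter__(self) -> Iterator[pd.DataFrame]:
        return iter(self._pages)


def row_count_from_batches(batches) -> int:
    # Mirrors the widget's fallback: trust the metadata count when present,
    # otherwise report 0 rather than issuing a COUNT(*) query.
    total = getattr(batches, "total_rows", None)
    return total if total is not None else 0


page = pd.DataFrame({"col": [1, 2]})
print(row_count_from_batches(StubPandasBatches([page], total_rows=7)))     # 7
print(row_count_from_batches(StubPandasBatches([page], total_rows=None)))  # 0
print(row_count_from_batches(iter([page])))  # 0: a plain iterator carries no metadata

The `isinstance` check in PATCH 01 serves the same purpose as the `getattr` probe here: an object that carries no row-count metadata degrades to a count of 0 instead of triggering an extra query.]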
From 6827775ac0030d746a4f9f6b41e69901a2bed393 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 24 Jul 2025 23:24:46 +0000 Subject: [PATCH 03/15] fix a typo --- tests/system/small/test_anywidget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index fbb8851ef9..37ff53fa01 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -533,7 +533,7 @@ def test_widget_row_count_should_respect_max_results_on_creation( page_size=widget.page_size, max_results=max_results ) - # Re-apply thelogic to update row_count + # Re-apply the logic to update row_count if isinstance(widget._batches, PandasBatches): widget.row_count = widget._batches.total_rows or 0 From c5e2bafe45515d477c29584d7e32ca457ceccc88 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 25 Jul 2025 22:40:53 +0000 Subject: [PATCH 04/15] change how row_count is updated --- bigframes/display/anywidget.py | 13 ++--- notebooks/dataframes/dataframe.ipynb | 4 +- tests/system/small/test_anywidget.py | 81 ++++++++++++++++++---------- 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 9e70293816..8037082c5a 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -77,14 +77,11 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Initialize data fetching attributes. self._batches = dataframe.to_pandas_batches(page_size=initial_page_size) - # Access total_rows through type casting (internal use only) - from bigframes.core.blocks import PandasBatches - - if isinstance(self._batches, PandasBatches): - self.row_count = self._batches.total_rows or 0 - else: - # Fallback for compatibility - self.row_count = 0 + # Get total rows efficiently by executing the query once + execute_result = dataframe._block.session._executor.execute( + dataframe._block.expr, ordered=True + ) + self.row_count = execute_result.total_rows or 0 # set traitlets properties that trigger observers self.page_size = initial_page_size diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index de9bb1d04f..ae03b56c72 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -5366,7 +5366,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -5380,7 +5380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 37ff53fa01..fa49c41f97 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -112,6 +112,32 @@ def empty_bf_df( return session.read_pandas(empty_pandas_df) +def mock_execute_total_rows_is_none(self, schema, *args, **kwargs): + """Mocks an execution result where the total row count is missing.""" + from bigframes.session.executor import ExecuteResult + + return ExecuteResult( + iter([]), # arrow_batches + schema=schema, + query_job=None, + total_bytes=None, + total_rows=None, # The specific failure condition for this case + ) + + +def mock_execute_batches_are_invalid(self, schema, *args, **kwargs): + """Mocks an execution result where the batch data is an invalid type.""" + from bigframes.session.executor import ExecuteResult + + 
return ExecuteResult( + None, # Invalid type for arrow_batches, which should be an iterator + schema=schema, + query_job=None, + total_bytes=None, + total_rows=100, # A valid row count, as the error is in the batch data + ) + + def _assert_html_matches_pandas_slice( table_html: str, expected_pd_slice: pd.DataFrame, @@ -461,46 +487,43 @@ def test_widget_row_count_should_be_immutable_after_creation( assert widget.row_count == initial_row_count -def test_widget_should_fallback_to_zero_rows_when_total_rows_is_none( - paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch -): - """ - Given an internal component that fails to provide a total row count, - when the widget is created, the row_count should safely fall back to 0. - """ - from bigframes.core.blocks import PandasBatches - - # Simulate an internal failure where total_rows returns None - monkeypatch.setattr(PandasBatches, "total_rows", property(lambda self: None)) - - with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget - - widget = TableWidget(paginated_bf_df) - - assert widget.row_count == 0 - - -def test_widget_should_fallback_to_zero_rows_when_batches_are_invalid_type( - paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch +@pytest.mark.parametrize( + "mock_function", + [ + mock_execute_total_rows_is_none, + mock_execute_batches_are_invalid, + ], + # 'ids' provides descriptive names for each test run in the pytest report. + ids=[ + "when_total_rows_is_None", + "when_arrow_batches_are_invalid", + ], +) +def test_widget_should_fallback_to_zero_rows_on_error( + paginated_bf_df: bf.dataframe.DataFrame, + monkeypatch: pytest.MonkeyPatch, + mock_function, ): """ - Given an internal component that returns an unexpected data type, - when the widget is created, the row_count should safely fall back to 0. + Given an internal component fails to return valid execution data, + when the TableWidget is created, its row_count should safely fall back to 0. """ - # Simulate internal method returning an unexpected type (a simple iterator) - def mock_to_pandas_batches(self, **kwargs): - return iter([paginated_bf_df.to_pandas().iloc[:2]]) - + # The 'self' argument is automatically handled when monkeypatch calls the method. + # We use a lambda to pass the DataFrame's schema to our mock function. monkeypatch.setattr( - "bigframes.dataframe.DataFrame.to_pandas_batches", mock_to_pandas_batches + "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", + lambda self, *args, **kwargs: mock_function( + self, paginated_bf_df._block.expr.schema, *args, **kwargs + ), ) with bf.option_context("display.repr_mode", "anywidget"): from bigframes.display import TableWidget + # The widget should handle the faulty data from the mock without crashing. widget = TableWidget(paginated_bf_df) + # The key assertion: The widget safely defaults to 0 rows. 
assert widget.row_count == 0 From 77915f3d36a347e4e84d020cdf4334cfe1a5e880 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 29 Jul 2025 18:59:06 +0000 Subject: [PATCH 05/15] testcase stil fails, need to merged in 1888 --- bigframes/display/anywidget.py | 32 ++++++---- tests/system/small/test_anywidget.py | 96 +++++++++++----------------- 2 files changed, 57 insertions(+), 71 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 8037082c5a..3e9b4ecffc 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -65,29 +65,35 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): super().__init__() self._dataframe = dataframe - # Initialize attributes that might be needed by observers FIRST + # Initialize attributes that might be needed by observers first self._table_id = str(uuid.uuid4()) self._all_data_loaded = False self._batch_iter: Optional[Iterator[pd.DataFrame]] = None self._cached_batches: List[pd.DataFrame] = [] - # respect display options for initial page size + # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - # Initialize data fetching attributes. - self._batches = dataframe.to_pandas_batches(page_size=initial_page_size) + try: + # Fetches initial data batches and row count for display. + # `to_pandas_batches` provides an iterable of pandas DataFrames + # and eagerly retrieves the total row count + self._batches = dataframe.to_pandas_batches( + page_size=initial_page_size, + ) - # Get total rows efficiently by executing the query once - execute_result = dataframe._block.session._executor.execute( - dataframe._block.expr, ordered=True - ) - self.row_count = execute_result.total_rows or 0 + # Access the total_rows property directly + self.row_count = self._batches.total_rows or 0 + self.page_size = initial_page_size - # set traitlets properties that trigger observers - self.page_size = initial_page_size + # Generates the initial HTML table content + self._set_table_html() - # get the initial page - self._set_table_html() + except Exception: + self.row_count = 0 + self.page_size = initial_page_size + self._batches = iter([]) + self.table_html = "" @functools.cached_property def _esm(self): diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index fa49c41f97..cba700edec 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import typing - import pandas as pd import pytest @@ -112,29 +110,20 @@ def empty_bf_df( return session.read_pandas(empty_pandas_df) -def mock_execute_total_rows_is_none(self, schema, *args, **kwargs): - """Mocks an execution result where the total row count is missing.""" - from bigframes.session.executor import ExecuteResult - - return ExecuteResult( - iter([]), # arrow_batches - schema=schema, - query_job=None, - total_bytes=None, - total_rows=None, # The specific failure condition for this case - ) - - -def mock_execute_batches_are_invalid(self, schema, *args, **kwargs): - """Mocks an execution result where the batch data is an invalid type.""" +def mock_execute_result_with_params( + self, schema, total_rows_val, arrow_batches_val, *args, **kwargs +): + """ + Mocks an execution result with configurable total_rows and arrow_batches. 
+ """ from bigframes.session.executor import ExecuteResult return ExecuteResult( - None, # Invalid type for arrow_batches, which should be an iterator + iter(arrow_batches_val), schema=schema, query_job=None, total_bytes=None, - total_rows=100, # A valid row count, as the error is in the batch data + total_rows=total_rows_val, ) @@ -475,25 +464,27 @@ def test_widget_row_count_should_be_immutable_after_creation( """ from bigframes.display import TableWidget + # Use a context manager to ensure the option is reset with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): widget = TableWidget(paginated_bf_df) initial_row_count = widget.row_count - assert initial_row_count == EXPECTED_ROW_COUNT # Change a global option that could influence row count bf.options.display.max_rows = 10 - # The widget's row count was fixed at creation and should not change. + # Verify the row count remains immutable. assert widget.row_count == initial_row_count @pytest.mark.parametrize( - "mock_function", + "total_rows_param, arrow_batches_param", [ - mock_execute_total_rows_is_none, - mock_execute_batches_are_invalid, + # Corresponds to mock_execute_total_rows_is_none + (None, []), + # Corresponds to mock_execute_batches_are_invalid (assuming empty list + # for invalid batches for now) + (100, []), ], - # 'ids' provides descriptive names for each test run in the pytest report. ids=[ "when_total_rows_is_None", "when_arrow_batches_are_invalid", @@ -502,65 +493,54 @@ def test_widget_row_count_should_be_immutable_after_creation( def test_widget_should_fallback_to_zero_rows_on_error( paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch, - mock_function, + total_rows_param, + arrow_batches_param, ): """ Given an internal component fails to return valid execution data, when the TableWidget is created, its row_count should safely fall back to 0. """ - # The 'self' argument is automatically handled when monkeypatch calls the method. - # We use a lambda to pass the DataFrame's schema to our mock function. + # Patch the executor's 'execute' method to simulate an error. monkeypatch.setattr( "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", - lambda self, *args, **kwargs: mock_function( - self, paginated_bf_df._block.expr.schema, *args, **kwargs + lambda self, *args, **kwargs: mock_execute_result_with_params( + self, + paginated_bf_df._block.expr.schema, + total_rows_param, + arrow_batches_param, + *args, + **kwargs ), ) + # Create the TableWidget under the error condition. with bf.option_context("display.repr_mode", "anywidget"): from bigframes.display import TableWidget # The widget should handle the faulty data from the mock without crashing. widget = TableWidget(paginated_bf_df) - # The key assertion: The widget safely defaults to 0 rows. + # The widget safely defaults to 0 rows. assert widget.row_count == 0 -@pytest.mark.parametrize( - "max_results, expected_rows", - [ - (None, EXPECTED_ROW_COUNT), - (3, 3), - (10, EXPECTED_ROW_COUNT), - ], - ids=["no_limit", "limit_is_less_than_total", "limit_is_greater_than_total"], -) -def test_widget_row_count_should_respect_max_results_on_creation( +def test_widget_row_count_reflects_actual_data_available( paginated_bf_df: bf.dataframe.DataFrame, - max_results: typing.Optional[int], - expected_rows: int, ): """ - Given a max_results value, when a TableWidget is created with custom batches, - its row_count should be correctly capped by that value. 
+ Test that widget row_count reflects the actual data available, + regardless of theoretical limits. """ - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - from bigframes.core.blocks import PandasBatches - from bigframes.display import TableWidget + from bigframes.display import TableWidget + # Set up display options that define a page size. + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): widget = TableWidget(paginated_bf_df) - # Override batches with max_results to test the behavior - widget._batches = paginated_bf_df.to_pandas_batches( - page_size=widget.page_size, max_results=max_results - ) - - # Re-apply the logic to update row_count - if isinstance(widget._batches, PandasBatches): - widget.row_count = widget._batches.total_rows or 0 - - assert widget.row_count == expected_rows + # The widget should report the total rows in the DataFrame, + # not limited by page_size (which only affects pagination) + assert widget.row_count == EXPECTED_ROW_COUNT + assert widget.page_size == 2 # Respects the display option # TODO(shuowei): Add tests for custom index and multiindex From 8056b47aa512adfc25b23c2a18bdd2dc698d26f7 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 30 Jul 2025 03:27:32 +0000 Subject: [PATCH 06/15] update the method of using PandasBatches.total_rows --- bigframes/display/anywidget.py | 27 ++++++++++++++++++--------- tests/system/small/test_anywidget.py | 15 +++++++++++---- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 3e9b4ecffc..198fc1604d 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -17,12 +17,14 @@ from importlib import resources import functools import math -from typing import Any, Dict, Iterator, List, Optional, Type +import typing +from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid import pandas as pd import bigframes +import bigframes.core.blocks import bigframes.display.html # anywidget and traitlets are optional dependencies. We don't want the import of this @@ -69,6 +71,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._table_id = str(uuid.uuid4()) self._all_data_loaded = False self._batch_iter: Optional[Iterator[pd.DataFrame]] = None + self._batches: Optional[bigframes.core.blocks.PandasBatches] = None self._cached_batches: List[pd.DataFrame] = [] # Respect display options for initial page size @@ -76,14 +79,16 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): try: # Fetches initial data batches and row count for display. - # `to_pandas_batches` provides an iterable of pandas DataFrames - # and eagerly retrieves the total row count - self._batches = dataframe.to_pandas_batches( + batches = dataframe.to_pandas_batches( page_size=initial_page_size, ) + self._batches = cast(bigframes.core.blocks.PandasBatches, batches) - # Access the total_rows property directly - self.row_count = self._batches.total_rows or 0 + # Use total_rows if available, otherwise default to 0. 
+ if self._batches: + self.row_count = self._batches.total_rows or 0 + else: + self.row_count = 0 self.page_size = initial_page_size # Generates the initial HTML table content @@ -92,7 +97,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): except Exception: self.row_count = 0 self.page_size = initial_page_size - self._batches = iter([]) + self._batches = None self.table_html = "" @functools.cached_property @@ -176,7 +181,10 @@ def _get_next_batch(self) -> bool: def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" if self._batch_iter is None: - self._batch_iter = iter(self._batches) + if self._batches is None: + self._batch_iter = iter([]) + else: + self._batch_iter = iter(self._batches) return self._batch_iter @property @@ -188,7 +196,8 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self): """Reset the batch iterator when page size changes.""" - self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches) self._cached_batches = [] self._batch_iter = None self._all_data_loaded = False diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index cba700edec..716f44f039 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -476,14 +476,21 @@ def test_widget_row_count_should_be_immutable_after_creation( assert widget.row_count == initial_row_count +class FaultyIterator: + def __iter__(self): + return self + + def __next__(self): + raise ValueError("Simulated read error") + + @pytest.mark.parametrize( "total_rows_param, arrow_batches_param", [ - # Corresponds to mock_execute_total_rows_is_none + # Case 1: total_rows is None, which should be handled gracefully. (None, []), - # Corresponds to mock_execute_batches_are_invalid (assuming empty list - # for invalid batches for now) - (100, []), + # Case 2: Batches are invalid and will raise an error during iteration. + (100, FaultyIterator()), ], ids=[ "when_total_rows_is_None", From d1a0c446d380d22b6b379ef2096afe97be82b5a5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 1 Aug 2025 08:06:01 +0000 Subject: [PATCH 07/15] change tests in read_gbq_colab --- bigframes/display/anywidget.py | 31 ++++++++------------ tests/benchmark/read_gbq_colab/first_page.py | 5 ++-- tests/benchmark/read_gbq_colab/last_page.py | 4 +-- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 198fc1604d..131bf7fe5a 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -77,28 +77,21 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - try: - # Fetches initial data batches and row count for display. - batches = dataframe.to_pandas_batches( - page_size=initial_page_size, - ) - self._batches = cast(bigframes.core.blocks.PandasBatches, batches) + # Fetches initial data batches and row count for display. + batches = dataframe.to_pandas_batches( + page_size=initial_page_size, + ) + self._batches = cast(bigframes.core.blocks.PandasBatches, batches) - # Use total_rows if available, otherwise default to 0. 
- if self._batches: - self.row_count = self._batches.total_rows or 0 - else: - self.row_count = 0 - self.page_size = initial_page_size + # Use total_rwos from batches directly + self.row_count = self._batches.total_rows or 0 - # Generates the initial HTML table content - self._set_table_html() + # Set page_size after _batches is initialized so observers have + # access to batch data + self.page_size = initial_page_size - except Exception: - self.row_count = 0 - self.page_size = initial_page_size - self._batches = None - self.table_html = "" + # Generates the initial HTML table content + self._set_table_html() @functools.cached_property def _esm(self): diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 7f8cdb0d51..2c57750d1f 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -27,8 +27,9 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Get number of rows (to calculate number of pages) and the first page. - df.shape + # Use total_rows from batches directly and the first page + execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) + execute_result.total_rows or 0 next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 7786e2f8bd..57796cab88 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -27,8 +27,8 @@ def last_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Get number of rows (to calculate number of pages) and then all pages. 
- df.shape + execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) + execute_result.total_rows or 0 for _ in df.to_pandas_batches(page_size=PAGE_SIZE): pass From cba67a0a36554bc02dca693c809a0cb9eccc31b9 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 1 Aug 2025 08:12:03 +0000 Subject: [PATCH 08/15] polish comment --- bigframes/display/anywidget.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 131bf7fe5a..3f78875a8a 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -86,8 +86,8 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Use total_rwos from batches directly self.row_count = self._batches.total_rows or 0 - # Set page_size after _batches is initialized so observers have - # access to batch data + # Set page_size after _batches is available since traitlets observers + # may depend on _batches being initialized when the change trigger happens self.page_size = initial_page_size # Generates the initial HTML table content From 61752ccc50a38614b94228e508ce0f630db2f98a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 6 Aug 2025 20:54:27 +0000 Subject: [PATCH 09/15] fix a test --- bigframes/display/anywidget.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 3f78875a8a..f782505ca2 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -21,6 +21,7 @@ from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid +import google.api_core.exceptions import pandas as pd import bigframes @@ -166,8 +167,16 @@ def _get_next_batch(self) -> bool: batch = next(iterator) self._cached_batches.append(batch) return True - except StopIteration: + except ( + StopIteration, + google.api_core.exceptions.GoogleAPICallError, + TypeError, + ValueError, + ) as e: self._all_data_loaded = True + if not isinstance(e, StopIteration): + # If we fail to get a batch, assume no more data is available. + self.row_count = 0 return False @property From 01300e96464435f65cfc2cee05de989009c08469 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 12 Aug 2025 00:21:40 +0000 Subject: [PATCH 10/15] change code and update more testcase --- bigframes/display/anywidget.py | 17 +++++++---------- .../read_gbq_colab/aggregate_output.py | 8 ++++---- tests/benchmark/read_gbq_colab/filter_output.py | 10 ++++++---- tests/benchmark/read_gbq_colab/first_page.py | 11 +++++++---- tests/benchmark/read_gbq_colab/last_page.py | 6 +++--- tests/benchmark/read_gbq_colab/sort_output.py | 8 ++++---- 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index f782505ca2..c364405db5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -72,26 +72,27 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._table_id = str(uuid.uuid4()) self._all_data_loaded = False self._batch_iter: Optional[Iterator[pd.DataFrame]] = None - self._batches: Optional[bigframes.core.blocks.PandasBatches] = None self._cached_batches: List[pd.DataFrame] = [] # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - # Fetches initial data batches and row count for display. 
batches = dataframe.to_pandas_batches( page_size=initial_page_size, ) - self._batches = cast(bigframes.core.blocks.PandasBatches, batches) + self._batches: bigframes.core.blocks.PandasBatches = cast( + bigframes.core.blocks.PandasBatches, batches + ) - # Use total_rwos from batches directly + # The query issued by `to_pandas_batches()` already contains metadata + # about how many results there were. Use that to avoid doing an extra + # COUNT(*) query that `len(...)` would do. self.row_count = self._batches.total_rows or 0 # Set page_size after _batches is available since traitlets observers # may depend on _batches being initialized when the change trigger happens self.page_size = initial_page_size - # Generates the initial HTML table content self._set_table_html() @functools.cached_property @@ -182,11 +183,7 @@ def _get_next_batch(self) -> bool: @property def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" - if self._batch_iter is None: - if self._batches is None: - self._batch_iter = iter([]) - else: - self._batch_iter = iter(self._batches) + self._batch_iter = iter(self._batches) return self._batch_iter @property diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index cd33ed2640..52ed95678e 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -26,8 +26,8 @@ def aggregate_output(*, project_id, dataset_id, table_id): df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - df.shape - next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches)) # To simulate very small rows that can only fit a boolean, # some tables don't have an integer column. If an integer column is available, @@ -43,8 +43,8 @@ def aggregate_output(*, project_id, dataset_id, table_id): .sum(numeric_only=True) ) - df_aggregated.shape - next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) + batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches_aggregated)) if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index b3c9181770..7ae0398a6e 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -31,16 +31,18 @@ def filter_output( df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - df.shape - next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches)) # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - rows, _ = df_filtered.shape + batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) # It's possible we don't have any pages at all, since we filtered out all # matching rows. 
- first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + first_page = next(iter(batches_filtered)) + rows = batches_filtered.total_rows + assert rows is not None assert len(first_page.index) <= rows diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 2c57750d1f..3f21693522 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib +import typing import benchmark.utils as utils @@ -27,10 +28,12 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Use total_rows from batches directly and the first page - execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) - execute_result.total_rows or 0 - next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + # Get number of rows (to calculate number of pages) and the first page. + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + first_page = next(iter(batches)) + assert first_page is not None + total_rows = typing.cast(typing.Any, batches).total_rows + assert total_rows is not None if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 57796cab88..e00b304900 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -27,9 +27,9 @@ def last_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) - execute_result.total_rows or 0 - for _ in df.to_pandas_batches(page_size=PAGE_SIZE): + # Get number of rows (to calculate number of pages) and then all pages. + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + for _ in batches: pass diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index 7933c4472e..ded42b77e5 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -28,8 +28,8 @@ def sort_output(*, project_id, dataset_id, table_id): ) # Simulate getting the first page, since we'll always do that first in the UI. 
- df.shape - next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches)) # Simulate the user sorting by a column and visualizing those results sort_column = "col_int64_1" @@ -37,8 +37,8 @@ def sort_output(*, project_id, dataset_id, table_id): sort_column = "col_bool_0" df_sorted = df.sort_values(sort_column) - df_sorted.shape - next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE))) + batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches_sorted)) if __name__ == "__main__": From c31b1120dc842eef5c47fffc70dfb3ed81cce596 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 14 Aug 2025 21:55:53 +0000 Subject: [PATCH 11/15] remove unneeded except --- bigframes/display/anywidget.py | 8 +------- tests/benchmark/read_gbq_colab/filter_output.py | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index c364405db5..95fd6ddd68 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -21,7 +21,6 @@ from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid -import google.api_core.exceptions import pandas as pd import bigframes @@ -168,12 +167,7 @@ def _get_next_batch(self) -> bool: batch = next(iterator) self._cached_batches.append(batch) return True - except ( - StopIteration, - google.api_core.exceptions.GoogleAPICallError, - TypeError, - ValueError, - ) as e: + except StopIteration as e: self._all_data_loaded = True if not isinstance(e, StopIteration): # If we fail to get a batch, assume no more data is available. diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index 7ae0398a6e..d8a8fd1abb 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -37,6 +37,7 @@ def filter_output( # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) + assert batches_filtered.total_rows >= 0 # It's possible we don't have any pages at all, since we filtered out all # matching rows. From 156c5bac6f8a7875652adac8b35f141d179aed6d Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 14 Aug 2025 22:11:59 +0000 Subject: [PATCH 12/15] add assert for total_rows --- tests/benchmark/read_gbq_colab/aggregate_output.py | 2 ++ tests/benchmark/read_gbq_colab/filter_output.py | 11 +++++++---- tests/benchmark/read_gbq_colab/first_page.py | 3 +-- tests/benchmark/read_gbq_colab/sort_output.py | 3 +++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index 52ed95678e..891991d9f7 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib +import typing import benchmark.utils as utils @@ -27,6 +28,7 @@ def aggregate_output(*, project_id, dataset_id, table_id): # Simulate getting the first page, since we'll always do that first in the UI. 
batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert typing.cast(typing.Any, batches).total_rows >= 0 next(iter(batches)) # To simulate very small rows that can only fit a boolean, diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index d8a8fd1abb..363203fd83 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib +import typing import benchmark.utils as utils +import bigframes.core.blocks import bigframes.pandas as bpd PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -37,13 +39,14 @@ def filter_output( # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) - assert batches_filtered.total_rows >= 0 - + batches_filtered = typing.cast( + bigframes.core.blocks.PandasBatches, batches_filtered + ) + rows = batches_filtered.total_rows + assert rows >= 0 # It's possible we don't have any pages at all, since we filtered out all # matching rows. first_page = next(iter(batches_filtered)) - rows = batches_filtered.total_rows - assert rows is not None assert len(first_page.index) <= rows diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 3f21693522..16d4d9ad01 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -30,10 +30,9 @@ def first_page(*, project_id, dataset_id, table_id): # Get number of rows (to calculate number of pages) and the first page. batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert typing.cast(typing.Any, batches).total_rows >= 0 first_page = next(iter(batches)) assert first_page is not None - total_rows = typing.cast(typing.Any, batches).total_rows - assert total_rows is not None if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index ded42b77e5..2443cb25a5 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib +import typing import benchmark.utils as utils @@ -29,6 +30,7 @@ def sort_output(*, project_id, dataset_id, table_id): # Simulate getting the first page, since we'll always do that first in the UI. 
batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert typing.cast(typing.Any, batches).total_rows >= 0 next(iter(batches)) # Simulate the user sorting by a column and visualizing those results @@ -38,6 +40,7 @@ def sort_output(*, project_id, dataset_id, table_id): df_sorted = df.sort_values(sort_column) batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) + assert typing.cast(typing.Any, batches_sorted).total_rows >= 0 next(iter(batches_sorted)) From 6b87339393adc6cd71cd863344cc9343511cc1f0 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 19 Aug 2025 21:11:38 +0000 Subject: [PATCH 13/15] get actual row_counts --- bigframes/display/anywidget.py | 29 ++++-- notebooks/dataframes/anywidget_mode.ipynb | 96 +++++++++++++------ .../read_gbq_colab/aggregate_output.py | 24 ++++- .../benchmark/read_gbq_colab/filter_output.py | 24 +++-- tests/benchmark/read_gbq_colab/first_page.py | 10 +- tests/benchmark/read_gbq_colab/sort_output.py | 22 ++++- tests/system/small/test_anywidget.py | 24 +---- 7 files changed, 149 insertions(+), 80 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 95fd6ddd68..34e6ae1933 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -17,8 +17,7 @@ from importlib import resources import functools import math -import typing -from typing import Any, cast, Dict, Iterator, List, Optional, Type +from typing import Any, Dict, Iterator, List, Optional, Type import uuid import pandas as pd @@ -76,17 +75,19 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - batches = dataframe.to_pandas_batches( - page_size=initial_page_size, - ) - self._batches: bigframes.core.blocks.PandasBatches = cast( - bigframes.core.blocks.PandasBatches, batches + execute_result = dataframe._block.session._executor.execute( + dataframe._block.expr, + ordered=True, + use_explicit_destination=True, ) # The query issued by `to_pandas_batches()` already contains metadata # about how many results there were. Use that to avoid doing an extra # COUNT(*) query that `len(...)` would do. 
- self.row_count = self._batches.total_rows or 0 + self.row_count = execute_result.total_rows or 0 + + # Create pandas batches from the ExecuteResult + self._batches = execute_result.to_pandas_batches(page_size=initial_page_size) # Set page_size after _batches is available since traitlets observers # may depend on _batches being initialized when the change trigger happens @@ -189,8 +190,16 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self): """Reset the batch iterator when page size changes.""" - batches = self._dataframe.to_pandas_batches(page_size=self.page_size) - self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches) + # Execute with explicit destination for consistency with __init__ + execute_result = self._dataframe._block.session._executor.execute( + self._dataframe._block.expr, + ordered=True, + use_explicit_destination=True, + ) + + # Create pandas batches from the ExecuteResult + self._batches = execute_result.to_pandas_batches(page_size=self.page_size) + self._cached_batches = [] self._batch_iter = None self._all_data_loaded = False diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 8bffb23df0..a441d8766f 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -73,18 +73,6 @@ "id": "f289d250", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -142,7 +130,19 @@ { "data": { "text/html": [ - "Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. Open Job" + "Query job 3245c62b-5969-4b78-b1f2-4330592d3c65 is DONE. 171.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 1a5cec48-7128-4986-86a6-369a8f366974 is DONE. 171.4 MB processed. 
Open Job" ], "text/plain": [ "" @@ -154,12 +154,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e74c3920b93644a0b2afdaa3841cad31", + "model_id": "d59362abcff6445ea879b5f43e0ca9b3", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='= 0 + execute_result = df._block.session._executor.execute( + df._block.expr, + ordered=True, + use_explicit_destination=True, + ) + assert execute_result.total_rows is not None and execute_result.total_rows >= 0 + batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # To simulate very small rows that can only fit a boolean, @@ -44,8 +48,18 @@ def aggregate_output(*, project_id, dataset_id, table_id): .groupby("rounded") .sum(numeric_only=True) ) - - batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE) + execute_result_aggregated = df_aggregated._block.session._executor.execute( + df_aggregated._block.expr, + ordered=True, + use_explicit_destination=True, + ) + assert ( + execute_result_aggregated.total_rows is not None + and execute_result_aggregated.total_rows >= 0 + ) + batches_aggregated = execute_result_aggregated.to_pandas_batches( + page_size=PAGE_SIZE + ) next(iter(batches_aggregated)) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index 363203fd83..b1dfdf3424 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib -import typing import benchmark.utils as utils -import bigframes.core.blocks import bigframes.pandas as bpd PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -33,17 +31,29 @@ def filter_output( df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - batches = df.to_pandas_batches(page_size=PAGE_SIZE) + # Force BigQuery execution to get total_rows metadata + execute_result = df._block.session._executor.execute( + df._block.expr, + ordered=True, + use_explicit_destination=True, + ) + batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) - batches_filtered = typing.cast( - bigframes.core.blocks.PandasBatches, batches_filtered + # Force BigQuery execution for filtered DataFrame to get total_rows metadata + execute_result_filtered = df_filtered._block.session._executor.execute( + df_filtered._block.expr, + ordered=True, + use_explicit_destination=True, ) - rows = batches_filtered.total_rows + + rows = execute_result_filtered.total_rows or 0 assert rows >= 0 + + batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE) + # It's possible we don't have any pages at all, since we filtered out all # matching rows. first_page = next(iter(batches_filtered)) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 16d4d9ad01..90bd4024cb 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pathlib -import typing import benchmark.utils as utils @@ -29,8 +28,13 @@ def first_page(*, project_id, dataset_id, table_id): ) # Get number of rows (to calculate number of pages) and the first page. - batches = df.to_pandas_batches(page_size=PAGE_SIZE) - assert typing.cast(typing.Any, batches).total_rows >= 0 + execute_result = df._block.session._executor.execute( + df._block.expr, + ordered=True, + use_explicit_destination=True, + ) + assert execute_result.total_rows is not None and execute_result.total_rows >= 0 + batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) first_page = next(iter(batches)) assert first_page is not None diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index 2443cb25a5..9724373dde 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib -import typing import benchmark.utils as utils @@ -29,8 +28,13 @@ def sort_output(*, project_id, dataset_id, table_id): ) # Simulate getting the first page, since we'll always do that first in the UI. - batches = df.to_pandas_batches(page_size=PAGE_SIZE) - assert typing.cast(typing.Any, batches).total_rows >= 0 + execute_result = df._block.session._executor.execute( + df._block.expr, + ordered=True, + use_explicit_destination=True, + ) + assert execute_result.total_rows is not None and execute_result.total_rows >= 0 + batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # Simulate the user sorting by a column and visualizing those results @@ -39,8 +43,16 @@ def sort_output(*, project_id, dataset_id, table_id): sort_column = "col_bool_0" df_sorted = df.sort_values(sort_column) - batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) - assert typing.cast(typing.Any, batches_sorted).total_rows >= 0 + execute_result_sorted = df_sorted._block.session._executor.execute( + df_sorted._block.expr, + ordered=True, + use_explicit_destination=True, + ) + assert ( + execute_result_sorted.total_rows is not None + and execute_result_sorted.total_rows >= 0 + ) + batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches_sorted)) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 716f44f039..2103c52dbb 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -484,24 +484,9 @@ def __next__(self): raise ValueError("Simulated read error") -@pytest.mark.parametrize( - "total_rows_param, arrow_batches_param", - [ - # Case 1: total_rows is None, which should be handled gracefully. - (None, []), - # Case 2: Batches are invalid and will raise an error during iteration. 
- (100, FaultyIterator()), - ], - ids=[ - "when_total_rows_is_None", - "when_arrow_batches_are_invalid", - ], -) -def test_widget_should_fallback_to_zero_rows_on_error( +def test_widget_should_fallback_to_zero_rows_with_invlid_total_rows( paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch, - total_rows_param, - arrow_batches_param, ): """ Given an internal component fails to return valid execution data, @@ -511,12 +496,7 @@ def test_widget_should_fallback_to_zero_rows_on_error( monkeypatch.setattr( "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", lambda self, *args, **kwargs: mock_execute_result_with_params( - self, - paginated_bf_df._block.expr.schema, - total_rows_param, - arrow_batches_param, - *args, - **kwargs + self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs ), ) From b442fceca5ad4d46eb5d8236c04d9b7a3f5be7b5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 19 Aug 2025 21:21:29 +0000 Subject: [PATCH 14/15] avoid two query calls --- bigframes/display/anywidget.py | 12 +++--- notebooks/dataframes/anywidget_mode.ipynb | 48 +++-------------------- 2 files changed, 11 insertions(+), 49 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 34e6ae1933..1ca57e89ef 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -89,9 +89,10 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Create pandas batches from the ExecuteResult self._batches = execute_result.to_pandas_batches(page_size=initial_page_size) - # Set page_size after _batches is available since traitlets observers - # may depend on _batches being initialized when the change trigger happens - self.page_size = initial_page_size + # Set page_size after _batches is available, but avoid triggering observers + # by setting the underlying traitlet value directly + self._trait_values["page_size"] = initial_page_size + self._trait_notifiers["page_size"] = {} # Initialize notifiers if needed self._set_table_html() @@ -168,11 +169,8 @@ def _get_next_batch(self) -> bool: batch = next(iterator) self._cached_batches.append(batch) return True - except StopIteration as e: + except StopIteration: self._all_data_loaded = True - if not isinstance(e, StopIteration): - # If we fail to get a batch, assume no more data is available. - self.row_count = 0 return False @property diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index a441d8766f..34d9fae12b 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -130,19 +130,7 @@ { "data": { "text/html": [ - "Query job 3245c62b-5969-4b78-b1f2-4330592d3c65 is DONE. 171.4 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1a5cec48-7128-4986-86a6-369a8f366974 is DONE. 171.4 MB processed. Open Job" + "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -154,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d59362abcff6445ea879b5f43e0ca9b3", + "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b", "version_major": 2, "version_minor": 1 }, @@ -198,19 +186,7 @@ { "data": { "text/html": [ - "Query job 356f561b-5017-413f-950b-2bc4c7798a24 is DONE. 171.4 MB processed. 
Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 72162728-56a3-47ce-bdb1-61b038cc2146 is DONE. 171.4 MB processed. Open Job"
+       "Query job 86d748cf-699c-407c-8eba-2d6421375aad is DONE. 171.4 MB processed. Open Job"
       ],
       "text/plain": [
        ""
@@ -229,7 +205,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8fac39e9b92e42d283883137f155526f",
+       "model_id": "401985bd2b3f40f3a2f7e48eeabb272d",
        "version_major": 2,
        "version_minor": 1
       },
@@ -316,19 +292,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 77f0582b-b68c-46a7-bf25-463837a4ef3f is DONE. 171.4 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job ec2bcbc2-0f5a-45e9-affc-485183cb245e is DONE. 171.4 MB processed. Open Job"
+       "Query job 2cb31c3a-ccbc-40fc-b548-ce8503fd2cc3 is DONE. 171.4 MB processed. Open Job"
       ],
       "text/plain": [
        ""
@@ -347,7 +311,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fe6358fd83d6431198944e601ea00372",
+       "model_id": "9d2e3ced089a4cadbec9eb06d3724237",
        "version_major": 2,
        "version_minor": 1
       },

From 0caaa52a4c39c914f6c1836ae455c9c3a2430da7 Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Thu, 21 Aug 2025 21:06:22 +0000
Subject: [PATCH 15/15] remove double query when displaying widget

---
 bigframes/display/anywidget.py            | 13 ++++++++-----
 notebooks/dataframes/anywidget_mode.ipynb | 16 ++++++++--------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 1ca57e89ef..a916823e9c 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -63,8 +63,9 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
                 "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
) - super().__init__() self._dataframe = dataframe + self._initializing = True + super().__init__() # Initialize attributes that might be needed by observers first self._table_id = str(uuid.uuid4()) @@ -89,12 +90,10 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Create pandas batches from the ExecuteResult self._batches = execute_result.to_pandas_batches(page_size=initial_page_size) - # Set page_size after _batches is available, but avoid triggering observers - # by setting the underlying traitlet value directly - self._trait_values["page_size"] = initial_page_size - self._trait_notifiers["page_size"] = {} # Initialize notifiers if needed + self.page_size = initial_page_size self._set_table_html() + self._initializing = False @functools.cached_property def _esm(self): @@ -227,11 +226,15 @@ def _set_table_html(self): @traitlets.observe("page") def _page_changed(self, _change: Dict[str, Any]): """Handler for when the page number is changed from the frontend.""" + if self._initializing: + return self._set_table_html() @traitlets.observe("page_size") def _page_size_changed(self, _change: Dict[str, Any]): """Handler for when the page size is changed from the frontend.""" + if self._initializing: + return # Reset the page to 0 when page size changes to avoid invalid page states self.page = 0 diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 34d9fae12b..05ef99c1c3 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -130,7 +130,7 @@ { "data": { "text/html": [ - "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job" + "Query job 1171b7b3-3f65-4165-a69d-69dad5a100d1 is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b", + "model_id": "6b70bf0e30a04a3cab11e03b2ed80856", "version_major": 2, "version_minor": 1 }, @@ -186,7 +186,7 @@ { "data": { "text/html": [ - "Query job 86d748cf-699c-407c-8eba-2d6421375aad is DONE. 171.4 MB processed. Open Job" + "Query job 3100859b-c57c-42fe-a5fb-abb4f2f25db2 is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -205,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "401985bd2b3f40f3a2f7e48eeabb272d", + "model_id": "4714b0794f55435a8d3e136517158a5c", "version_major": 2, "version_minor": 1 }, @@ -292,7 +292,7 @@ { "data": { "text/html": [ - "Query job 2cb31c3a-ccbc-40fc-b548-ce8503fd2cc3 is DONE. 171.4 MB processed. Open Job" + "Query job b4143f15-4bac-44a5-bb29-c5056f95b30b is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -311,7 +311,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d2e3ced089a4cadbec9eb06d3724237", + "model_id": "c70b5611db6b4e6a806a16d0a8287cd3", "version_major": 2, "version_minor": 1 }, @@ -335,7 +335,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -349,7 +349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.10.15" } }, "nbformat": 4,
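
Note on the pattern PATCH 15 converges on: set an `_initializing` flag before
super().__init__() and have every traitlets observer return early while it is
True, so widget construction triggers exactly one query instead of two. Below
is a minimal, self-contained sketch of that technique (not part of the patches
above; it assumes only the `traitlets` package, and DemoWidget and
_expensive_refresh are hypothetical stand-ins for TableWidget and
_set_table_html):

import traitlets


class DemoWidget(traitlets.HasTraits):
    page = traitlets.Int(0)
    page_size = traitlets.Int(10)

    def __init__(self, page_size: int = 25):
        # A plain attribute can be assigned before super().__init__()
        # because only "page" and "page_size" are descriptor-managed traits.
        self._initializing = True
        super().__init__()
        # This assignment fires _page_size_changed, but the guard turns it
        # into a no-op, so the expensive refresh does not run twice here.
        self.page_size = page_size
        self._expensive_refresh()  # exactly one refresh at startup
        self._initializing = False

    @traitlets.observe("page")
    def _page_changed(self, _change):
        if self._initializing:
            return
        self._expensive_refresh()

    @traitlets.observe("page_size")
    def _page_size_changed(self, _change):
        if self._initializing:
            return
        self.page = 0  # reset pagination to avoid invalid page states
        self._expensive_refresh()

    def _expensive_refresh(self):
        # Stand-in for the real widget's BigQuery fetch and HTML rendering.
        print(f"refresh: page={self.page}, page_size={self.page_size}")


widget = DemoWidget()  # one refresh during construction, not two
widget.page = 1        # the observer fires a second refresh

Compared with the _trait_values/_trait_notifiers workaround in PATCH 14, which
reaches into traitlets internals, the flag-based guard achieves the same
single-query behavior through public API only.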