Skip to content
Open
56 changes: 38 additions & 18 deletions bigframes/display/anywidget.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import pandas as pd

import bigframes
import bigframes.core.blocks
import bigframes.display.html

# anywidget and traitlets are optional dependencies. We don't want the import of this
Expand All @@ -45,8 +46,10 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
Expand All @@ -60,32 +63,37 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
)

super().__init__()
self._dataframe = dataframe
self._initializing = True
super().__init__()

# Initialize attributes that might be needed by observers FIRST
# Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []

# respect display options for initial page size
# Respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows

# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
execute_result = dataframe._block.session._executor.execute(
dataframe._block.expr,
ordered=True,
use_explicit_destination=True,
)

# set traitlets properties that trigger observers
self.page_size = initial_page_size
# The query issued by `to_pandas_batches()` already contains metadata
# about how many results there were. Use that to avoid doing an extra
# COUNT(*) query that `len(...)` would do.
self.row_count = execute_result.total_rows or 0

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)
# Create pandas batches from the ExecuteResult
self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)

self.page_size = initial_page_size

# get the initial page
self._set_table_html()
self._initializing = False

@functools.cached_property
def _esm(self):
Expand Down Expand Up @@ -167,8 +175,7 @@ def _get_next_batch(self) -> bool:
@property
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
"""Lazily initializes and returns the batch iterator."""
if self._batch_iter is None:
self._batch_iter = iter(self._batches)
self._batch_iter = iter(self._batches)
return self._batch_iter

@property
Expand All @@ -180,7 +187,16 @@ def _cached_data(self) -> pd.DataFrame:

def _reset_batches_for_new_page_size(self):
"""Reset the batch iterator when page size changes."""
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
# Execute with explicit destination for consistency with __init__
execute_result = self._dataframe._block.session._executor.execute(
self._dataframe._block.expr,
ordered=True,
use_explicit_destination=True,
)

# Create pandas batches from the ExecuteResult
self._batches = execute_result.to_pandas_batches(page_size=self.page_size)

self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False
Expand Down Expand Up @@ -210,11 +226,15 @@ def _set_table_html(self):
@traitlets.observe("page")
def _page_changed(self, _change: Dict[str, Any]):
"""Handler for when the page number is changed from the frontend."""
if self._initializing:
return
self._set_table_html()

@traitlets.observe("page_size")
def _page_size_changed(self, _change: Dict[str, Any]):
"""Handler for when the page size is changed from the frontend."""
if self._initializing:
return
# Reset the page to 0 when page size changes to avoid invalid page states
self.page = 0

Expand Down
74 changes: 45 additions & 29 deletions notebooks/dataframes/anywidget_mode.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,6 @@
"id": "f289d250",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a643d120-4af9-44fc-ba3c-ed461cf1092b&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -139,15 +127,27 @@
"id": "ce250157",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 1171b7b3-3f65-4165-a69d-69dad5a100d1 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1171b7b3-3f65-4165-a69d-69dad5a100d1&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
"model_id": "6b70bf0e30a04a3cab11e03b2ed80856",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"table table-striped table-ho…"
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"metadata": {},
Expand Down Expand Up @@ -183,6 +183,18 @@
"id": "6920d49b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 3100859b-c57c-42fe-a5fb-abb4f2f25db2 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:3100859b-c57c-42fe-a5fb-abb4f2f25db2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -193,12 +205,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "121e3d2f28004036a922e3a11a08d4b7",
"model_id": "4714b0794f55435a8d3e136517158a5c",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"table table-striped table-ho…"
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"execution_count": 7,
Expand Down Expand Up @@ -273,10 +285,22 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
" warnings.warn(msg, bfe.AmbiguousWindowWarning)\n"
]
},
{
"data": {
"text/html": [
"Query job b4143f15-4bac-44a5-bb29-c5056f95b30b is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b4143f15-4bac-44a5-bb29-c5056f95b30b&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -287,12 +311,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ed335bbbc064e5391ea06a9a218642e",
"model_id": "c70b5611db6b4e6a806a16d0a8287cd3",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"table table-striped table-hover\" i…"
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
]
},
"execution_count": 9,
Expand All @@ -307,19 +331,11 @@
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
"small_widget"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
Expand All @@ -333,7 +349,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions notebooks/dataframes/dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5366,7 +5366,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
Expand All @@ -5380,7 +5380,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down
26 changes: 21 additions & 5 deletions tests/benchmark/read_gbq_colab/aggregate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,14 @@ def aggregate_output(*, project_id, dataset_id, table_id):
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert execute_result.total_rows is not None and execute_result.total_rows >= 0
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# To simulate very small rows that can only fit a boolean,
# some tables don't have an integer column. If an integer column is available,
Expand All @@ -42,9 +48,19 @@ def aggregate_output(*, project_id, dataset_id, table_id):
.groupby("rounded")
.sum(numeric_only=True)
)

df_aggregated.shape
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result_aggregated = df_aggregated._block.session._executor.execute(
df_aggregated._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert (
execute_result_aggregated.total_rows is not None
and execute_result_aggregated.total_rows >= 0
)
batches_aggregated = execute_result_aggregated.to_pandas_batches(
page_size=PAGE_SIZE
)
next(iter(batches_aggregated))


if __name__ == "__main__":
Expand Down
24 changes: 20 additions & 4 deletions tests/benchmark/read_gbq_colab/filter_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,32 @@ def filter_output(
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
# Force BigQuery execution to get total_rows metadata
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
rows, _ = df_filtered.shape
# Force BigQuery execution for filtered DataFrame to get total_rows metadata
execute_result_filtered = df_filtered._block.session._executor.execute(
df_filtered._block.expr,
ordered=True,
use_explicit_destination=True,
)

rows = execute_result_filtered.total_rows or 0
assert rows >= 0

batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)

# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows


Expand Down
11 changes: 9 additions & 2 deletions tests/benchmark/read_gbq_colab/first_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,15 @@ def first_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and the first page.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert execute_result.total_rows is not None and execute_result.total_rows >= 0
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
first_page = next(iter(batches))
assert first_page is not None


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions tests/benchmark/read_gbq_colab/last_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and then all pages.
df.shape
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
for _ in batches:
pass


Expand Down
Loading