From 28afa2c73c0517f9365fab05193706631b656551 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 22 Apr 2025 11:05:43 -0700 Subject: [PATCH 01/28] fix: DataFrame accessors is not pupulated (#1639) --- docs/reference/bigframes.pandas/frame.rst | 2 +- docs/reference/bigframes.pandas/series.rst | 10 +++++----- docs/templates/toc.yml | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index 4e231bd821..ea4c6dec1c 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -22,7 +22,7 @@ Accessors Plotting handling ^^^^^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.plotting +.. autoclass:: bigframes.operations.plotting.PlotAccessor :members: :inherited-members: :undoc-members: diff --git a/docs/reference/bigframes.pandas/series.rst b/docs/reference/bigframes.pandas/series.rst index 3cfb7d89dd..41b1529b0c 100644 --- a/docs/reference/bigframes.pandas/series.rst +++ b/docs/reference/bigframes.pandas/series.rst @@ -22,7 +22,7 @@ Accessors Datetime properties ^^^^^^^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.datetimes +.. autoclass:: bigframes.operations.datetimes.DatetimeMethods :members: :inherited-members: :undoc-members: @@ -30,7 +30,7 @@ Datetime properties String handling ^^^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.strings +.. autoclass:: bigframes.operations.strings.StringMethods :members: :inherited-members: :undoc-members: @@ -38,7 +38,7 @@ String handling List handling ^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.lists +.. autoclass:: bigframes.operations.lists.ListAccessor :members: :inherited-members: :undoc-members: @@ -54,7 +54,7 @@ Struct handling Blob handling ^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.blob +.. 
autoclass:: bigframes.operations.blob.BlobAccessor :members: :inherited-members: :undoc-members: @@ -62,7 +62,7 @@ Blob handling Plotting handling ^^^^^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.plotting +.. autoclass:: bigframes.operations.plotting.PlotAccessor :members: :inherited-members: :undoc-members: diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 1e712848df..f70e81d196 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -47,6 +47,8 @@ uid: bigframes.operations.structs.StructFrameAccessor - name: AI uid: bigframes.operations.ai.AIAccessor + status: beta + name: DataFrame - items: - name: DataFrameGroupBy uid: bigframes.core.groupby.DataFrameGroupBy @@ -84,6 +86,8 @@ uid: bigframes.operations.strings.StringMethods - name: StructAccessor uid: bigframes.operations.structs.StructAccessor + - name: ListAccessor + uid: bigframes.operations.lists.ListAccessor - name: PlotAccessor uid: bigframes.operations.plotting.PlotAccessor - name: BlobAccessor From 48d10d1f0150a29dd3b91f505f8d3874e0b88c42 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 22 Apr 2025 11:58:38 -0700 Subject: [PATCH 02/28] docs: note that `udf` is in preview and must be python 3.11 compatible (#1629) * docs: note that `udf` is in preview and must be python 3.11 compatible * user stacklevel=5 to optimize the warning for bpd.udf entry point --- bigframes/functions/_function_session.py | 9 ++++++--- bigframes/session/__init__.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 6d2f0dbd57..7fb5cc114b 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -740,9 +740,12 @@ def udf( BigQuery managed function. .. note:: - The udf must be self-contained, i.e. it must not contain any + This feature is in preview. The code in the udf must be + (1) self-contained, i.e. 
it must not contain any references to an import or variable defined outside the function - body. + body, and + (2) Python 3.11 compatible, as that is the environment + in which the code is executed in the cloud. .. note:: Please have following IAM roles enabled for you: @@ -801,7 +804,7 @@ def udf( https://pip.pypa.io/en/stable/reference/requirements-file-format/. """ - warnings.warn("udf is in preview.", category=bfe.PreviewWarning) + warnings.warn("udf is in preview.", category=bfe.PreviewWarning, stacklevel=5) # Some defaults may be used from the session if not provided otherwise. session = self._resolve_session(session) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7229a4641b..b189c0e194 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1426,9 +1426,12 @@ def udf( [BigQuery managed user-defined function](https://cloud.google.com/bigquery/docs/user-defined-functions-python). .. note:: - The udf must be self-contained, i.e. it must not contain any + This feature is in preview. The code in the udf must be + (1) self-contained, i.e. it must not contain any references to an import or variable defined outside the function - body. + body, and + (2) Python 3.11 compatible, as that is the environment + in which the code is executed in the cloud. .. 
note:: Please have BigQuery Data Editor (roles/bigquery.dataEditor) IAM From 53fc25bfc86e166b91e5001506051b1cac34c996 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 22 Apr 2025 12:22:55 -0700 Subject: [PATCH 03/28] fix: Prefer remote schema instead of throwing on materialize conflicts (#1644) --- bigframes/session/executor.py | 15 +++++++++++---- tests/system/small/test_dataframe.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 4c27c25058..496ff560dd 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -16,6 +16,7 @@ import abc import dataclasses +import itertools from typing import Callable, Iterator, Literal, Mapping, Optional, Sequence, Union from google.cloud import bigquery @@ -37,10 +38,16 @@ def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. # Any discrepencies between predicted schema and actual schema will produce errors. 
- return pyarrow.Table.from_batches( - self.arrow_batches(), - self.schema.to_pyarrow() if not self.total_rows else None, - ) + batches = iter(self.arrow_batches()) + peek_it = itertools.islice(batches, 0, 1) + peek_value = list(peek_it) + # TODO: Enforce our internal schema on the table for consistency + if len(peek_value) > 0: + return pyarrow.Table.from_batches( + itertools.chain(peek_value, batches), # reconstruct + ) + else: + return self.schema.to_pyarrow().empty_table() class Executor(abc.ABC): diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e77319b551..83901562d3 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5283,6 +5283,16 @@ def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_cre assert not loaded_scalars_df_index.empty +def test_read_gbq_to_pandas_no_exec(unordered_session: bigframes.Session): + metrics = unordered_session._metrics + execs_pre = metrics.execution_count + df = unordered_session.read_gbq("bigquery-public-data.ml_datasets.penguins") + df.to_pandas() + execs_post = metrics.execution_count + assert df.shape == (344, 7) + assert execs_pre == execs_post + + def test_to_gbq_table_labels(scalars_df_index): destination_table = "bigframes-dev.bigframes_tests_sys.table_labels" result_table = scalars_df_index.to_gbq( From 102d363aa7e3245ff262c817bc756ea0eaee57e7 Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Tue, 22 Apr 2025 15:59:05 -0500 Subject: [PATCH 04/28] feat: support passing list of values to bigframes.core.sql.simple_literal (#1641) --- bigframes/core/sql.py | 8 +++++- tests/unit/core/test_sql.py | 57 +++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index d197993305..04e678e713 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -43,7 +43,9 @@ ### Writing SQL 
Values (literals, column references, table references, etc.) -def simple_literal(value: bytes | str | int | bool | float | datetime.datetime | None): +def simple_literal( + value: bytes | str | int | bool | float | datetime.datetime | list | None, +): """Return quoted input string.""" # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals @@ -80,6 +82,10 @@ def simple_literal(value: bytes | str | int | bool | float | datetime.datetime | elif isinstance(value, decimal.Decimal): # TODO: disambiguate BIGNUMERIC based on scale and/or precision return f"CAST('{str(value)}' AS NUMERIC)" + elif isinstance(value, list): + simple_literals = [simple_literal(i) for i in value] + return f"[{', '.join(simple_literals)}]" + else: raise ValueError(f"Cannot produce literal for {value}") diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py index 913a5b61fe..1b5c63e061 100644 --- a/tests/unit/core/test_sql.py +++ b/tests/unit/core/test_sql.py @@ -73,6 +73,63 @@ def test_simple_literal(value, expected_pattern): assert re.match(expected_pattern, got) is not None +@pytest.mark.parametrize( + ("value", "expected"), + ( + # Try to have some list of literals for each scalar data type: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + ([None, None], "[NULL, NULL]"), + ([True, False], "[True, False]"), + ( + [b"\x01\x02\x03ABC", b"\x01\x02\x03ABC"], + "[b'\\x01\\x02\\x03ABC', b'\\x01\\x02\\x03ABC']", + ), + ( + [datetime.date(2025, 1, 1), datetime.date(2025, 1, 1)], + "[DATE('2025-01-01'), DATE('2025-01-01')]", + ), + ( + [datetime.datetime(2025, 1, 2, 3, 45, 6, 789123)], + "[DATETIME('2025-01-02T03:45:06.789123')]", + ), + ( + [shapely.Point(0, 1), shapely.Point(0, 2)], + "[ST_GEOGFROMTEXT('POINT (0 1)'), ST_GEOGFROMTEXT('POINT (0 2)')]", + ), + # TODO: INTERVAL type (e.g. 
from dateutil.relativedelta) + # TODO: JSON type (TBD what Python object that would correspond to) + ([123, 456], "[123, 456]"), + ( + [decimal.Decimal("123.75"), decimal.Decimal("456.78")], + "[CAST('123.75' AS NUMERIC), CAST('456.78' AS NUMERIC)]", + ), + # TODO: support BIGNUMERIC by looking at precision/scale of the DECIMAL + ([123.75, 456.78], "[123.75, 456.78]"), + # TODO: support RANGE type + (["abc", "def"], "['abc', 'def']"), + # TODO: support STRUCT type (possibly another method?) + ( + [datetime.time(12, 34, 56, 789123), datetime.time(11, 25, 56, 789123)], + "[TIME(DATETIME('1970-01-01 12:34:56.789123')), TIME(DATETIME('1970-01-01 11:25:56.789123'))]", + ), + ( + [ + datetime.datetime( + 2025, 1, 2, 3, 45, 6, 789123, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2025, 2, 1, 4, 45, 6, 789123, tzinfo=datetime.timezone.utc + ), + ], + "[TIMESTAMP('2025-01-02T03:45:06.789123+00:00'), TIMESTAMP('2025-02-01T04:45:06.789123+00:00')]", + ), + ), +) +def test_simple_literal_w_list(value: list, expected: str): + got = sql.simple_literal(value) + assert got == expected + + def test_create_vector_search_sql_simple(): result_query = sql.create_vector_search_sql( sql_string="SELECT embedding FROM my_embeddings_table WHERE id = 1", From f67aa414bb2247e415710b2f5e8ea2f29de7b19e Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:20:43 -0700 Subject: [PATCH 05/28] chore: remove read_gbq for large tables test. 
(#1646) --- tests/system/large/test_session.py | 37 ------------------------------ 1 file changed, 37 deletions(-) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index 90955f5ddf..d28146498d 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -23,43 +23,6 @@ import bigframes.session._io.bigquery -@pytest.mark.parametrize( - ("query_or_table", "index_col"), - [ - pytest.param( - "bigquery-public-data.patents_view.ipcr_201708", - (), - id="1g_table_w_default_index", - ), - pytest.param( - "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2011", - (), - id="30g_table_w_default_index", - ), - # TODO(chelsealin): Disable the long run tests until we have propertily - # ordering support to avoid materializating any data. - # # Adding default index to large tables would take much longer time, - # # e.g. ~5 mins for a 100G table, ~20 mins for a 1T table. - # pytest.param( - # "bigquery-public-data.stackoverflow.post_history", - # ["id"], - # id="100g_table_w_unique_column_index", - # ), - # pytest.param( - # "bigquery-public-data.wise_all_sky_data_release.all_wise", - # ["cntr"], - # id="1t_table_w_unique_column_index", - # ), - ], -) -def test_read_gbq_for_large_tables( - session: bigframes.Session, query_or_table, index_col -): - """Verify read_gbq() is able to read large tables.""" - df = session.read_gbq(query_or_table, index_col=index_col) - assert len(df.columns) != 0 - - def test_close(session: bigframes.Session): # we will create two tables and confirm that they are deleted # when the session is closed From 26ae5e7e769c5bde2d02063f7a475ed5024b6054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 22 Apr 2025 19:28:36 -0500 Subject: [PATCH 06/28] chore: add tests for `to_gbq` with flexible column names (#1645) * chore: add tests for `to_gbq` with flexible column names * just numbers --- tests/system/small/test_dataframe_io.py | 83 
+++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index a69c26bc54..e12db3f598 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -552,6 +552,89 @@ def test_to_gbq_w_duplicate_column_names( ) +def test_to_gbq_w_flexible_column_names( + scalars_df_index, dataset_id: str, bigquery_client +): + """Test the `to_gbq` API when dealing with flexible column names. + + This test is for BigQuery-backed storage nodes. + + See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names + """ + destination_table = f"{dataset_id}.test_to_gbq_w_flexible_column_names" + renamed_columns = { + # First column in Japanese (tests unicode). + "bool_col": "最初のカラム", + "bytes_col": "col with space", + # Dots aren't allowed in BigQuery column names, so these should be translated + "date_col": "col.with.dots", + "datetime_col": "col-with-hyphens", + "geography_col": "1start_with_number", + "int64_col": "col_with_underscore", + # Just numbers. + "int64_too": "123", + } + bf_df = scalars_df_index[renamed_columns.keys()].rename(columns=renamed_columns) + assert list(bf_df.columns) == list(renamed_columns.values()) + bf_df.to_gbq(destination_table, index=False) + + table = bigquery_client.get_table(destination_table) + columns = [field.name for field in table.schema] + assert columns == [ + "最初のカラム", + "col with space", + # Dots aren't allowed in BigQuery column names, so these should be translated + "col_with_dots", + "col-with-hyphens", + "1start_with_number", + "col_with_underscore", + "123", + ] + + +def test_to_gbq_w_flexible_column_names_local_node( + session, dataset_id: str, bigquery_client +): + """Test the `to_gbq` API when dealing with flexible column names. + + This test is for local nodes, e.g. read_pandas(), since those may go through + a different code path compared to data that starts in BigQuery. 
+ + See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names + """ + destination_table = f"{dataset_id}.test_to_gbq_w_flexible_column_names_local_node" + + data = { + # First column in Japanese (tests unicode). + "最初のカラム": [1, 2, 3], + "col with space": [4, 5, 6], + # Dots aren't allowed in BigQuery column names, so these should be translated + "col.with.dots": [7, 8, 9], + "col-with-hyphens": [10, 11, 12], + "1start_with_number": [13, 14, 15], + "col_with_underscore": [16, 17, 18], + "123": [19, 20, 21], + } + pd_df = pd.DataFrame(data) + assert list(pd_df.columns) == list(data.keys()) + bf_df = session.read_pandas(pd_df) + assert list(bf_df.columns) == list(data.keys()) + bf_df.to_gbq(destination_table, index=False) + + table = bigquery_client.get_table(destination_table) + columns = [field.name for field in table.schema] + assert columns == [ + "最初のカラム", + "col with space", + # Dots aren't allowed in BigQuery column names, so these should be translated + "col_with_dots", + "col-with-hyphens", + "1start_with_number", + "col_with_underscore", + "123", + ] + + def test_to_gbq_w_None_column_names( scalars_df_index, scalars_pandas_df_index, dataset_id ): From ae83e61c49ade64d6f727e9f364bd2f1aeec6e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 23 Apr 2025 17:37:56 -0500 Subject: [PATCH 07/28] deps: support shapely 1.8.5+ again (#1651) --- bigframes/bigquery/_operations/geo.py | 12 ++++++--- bigframes/dtypes.py | 2 +- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/bigquery/test_geo.py | 19 +++++++++++++ tests/unit/core/test_dtypes.py | 8 +++--- tests/unit/core/test_sql.py | 36 ++++++++++++++----------- 7 files changed, 55 insertions(+), 26 deletions(-) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 262dca6d6b..0856baabea 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -106,7 +106,9 @@ def 
st_area( def st_difference( series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], other: Union[ - bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry + bigframes.series.Series, + bigframes.geopandas.GeoSeries, + shapely.geometry.base.BaseGeometry, ], ) -> bigframes.series.Series: """ @@ -207,7 +209,9 @@ def st_difference( def st_distance( series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], other: Union[ - bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry + bigframes.series.Series, + bigframes.geopandas.GeoSeries, + shapely.geometry.base.BaseGeometry, ], *, use_spheroid: bool = False, @@ -282,7 +286,9 @@ def st_distance( def st_intersection( series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], other: Union[ - bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry + bigframes.series.Series, + bigframes.geopandas.GeoSeries, + shapely.geometry.base.BaseGeometry, ], ) -> bigframes.series.Series: """ diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index de6c331043..eff9d02c5c 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -607,7 +607,7 @@ def _infer_dtype_from_python_type(type_: type) -> Dtype: return DATE_DTYPE if issubclass(type_, datetime.time): return TIME_DTYPE - if issubclass(type_, shapely.Geometry): + if issubclass(type_, shapely.geometry.base.BaseGeometry): return GEO_DTYPE else: raise TypeError( diff --git a/setup.py b/setup.py index 1fe7006860..edc77e11b6 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "shapely >=2.0.0", + "shapely >=1.8.5", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index b0537cd035..dff245d176 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,7 +19,7 @@ pyarrow==15.0.2 pydata-google-auth==1.8.2 
requests==2.27.1 scikit-learn==1.2.2 -shapely==2.0.0 +shapely==1.8.5 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index fa2c522109..be517fb5cc 100644 --- a/tests/system/small/bigquery/test_geo.py +++ b/tests/system/small/bigquery/test_geo.py @@ -15,6 +15,7 @@ import geopandas # type: ignore import pandas as pd import pandas.testing +import pytest from shapely.geometry import ( # type: ignore GeometryCollection, LineString, @@ -94,6 +95,12 @@ def test_geo_st_difference_with_geometry_objects(): def test_geo_st_difference_with_single_geometry_object(): + pytest.importorskip( + "shapely", + minversion="2.0.0", + reason="shapely objects must be hashable to include in our expression trees", + ) + data1 = [ Polygon([(0, 0), (10, 0), (10, 10), (0, 10), (0, 0)]), Polygon([(0, 1), (10, 1), (10, 9), (0, 9), (0, 1)]), @@ -205,6 +212,12 @@ def test_geo_st_distance_with_geometry_objects(): def test_geo_st_distance_with_single_geometry_object(): + pytest.importorskip( + "shapely", + minversion="2.0.0", + reason="shapely objects must be hashable to include in our expression trees", + ) + data1 = [ # 0.00001 is approximately 1 meter. 
Polygon([(0, 0), (0.00001, 0), (0.00001, 0.00001), (0, 0.00001), (0, 0)]), @@ -279,6 +292,12 @@ def test_geo_st_intersection_with_geometry_objects(): def test_geo_st_intersection_with_single_geometry_object(): + pytest.importorskip( + "shapely", + minversion="2.0.0", + reason="shapely objects must be hashable to include in our expression trees", + ) + data1 = [ Polygon([(0, 0), (10, 0), (10, 10), (0, 10), (0, 0)]), Polygon([(0, 1), (10, 1), (10, 9), (0, 9), (0, 1)]), diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index bbeac3602b..37658bc436 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -20,7 +20,7 @@ import pandas as pd import pyarrow as pa # type: ignore import pytest -import shapely # type: ignore +import shapely.geometry # type: ignore import bigframes.core.compile.ibis_types import bigframes.dtypes @@ -231,9 +231,9 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str): (bool, bigframes.dtypes.BOOL_DTYPE), (int, bigframes.dtypes.INT_DTYPE), (str, bigframes.dtypes.STRING_DTYPE), - (shapely.Point, bigframes.dtypes.GEO_DTYPE), - (shapely.Polygon, bigframes.dtypes.GEO_DTYPE), - (shapely.Geometry, bigframes.dtypes.GEO_DTYPE), + (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE), + (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE), + (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE), ], ) def test_bigframes_type_supports_python_types(python_type, expected_dtype): diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py index 1b5c63e061..17da3008fc 100644 --- a/tests/unit/core/test_sql.py +++ b/tests/unit/core/test_sql.py @@ -74,43 +74,45 @@ def test_simple_literal(value, expected_pattern): @pytest.mark.parametrize( - ("value", "expected"), + ("value", "expected_pattern"), ( # Try to have some list of literals for each scalar data type: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - ([None, None], "[NULL, NULL]"), 
- ([True, False], "[True, False]"), + ([None, None], re.escape("[NULL, NULL]")), + ([True, False], re.escape("[True, False]")), ( [b"\x01\x02\x03ABC", b"\x01\x02\x03ABC"], - "[b'\\x01\\x02\\x03ABC', b'\\x01\\x02\\x03ABC']", + re.escape("[b'\\x01\\x02\\x03ABC', b'\\x01\\x02\\x03ABC']"), ), ( [datetime.date(2025, 1, 1), datetime.date(2025, 1, 1)], - "[DATE('2025-01-01'), DATE('2025-01-01')]", + re.escape("[DATE('2025-01-01'), DATE('2025-01-01')]"), ), ( [datetime.datetime(2025, 1, 2, 3, 45, 6, 789123)], - "[DATETIME('2025-01-02T03:45:06.789123')]", + re.escape("[DATETIME('2025-01-02T03:45:06.789123')]"), ), ( - [shapely.Point(0, 1), shapely.Point(0, 2)], - "[ST_GEOGFROMTEXT('POINT (0 1)'), ST_GEOGFROMTEXT('POINT (0 2)')]", + [shapely.geometry.Point(0, 1), shapely.geometry.Point(0, 2)], + r"\[ST_GEOGFROMTEXT\('POINT \(0[.]?0* 1[.]?0*\)'\), ST_GEOGFROMTEXT\('POINT \(0[.]?0* 2[.]?0*\)'\)\]", ), # TODO: INTERVAL type (e.g. from dateutil.relativedelta) # TODO: JSON type (TBD what Python object that would correspond to) - ([123, 456], "[123, 456]"), + ([123, 456], re.escape("[123, 456]")), ( [decimal.Decimal("123.75"), decimal.Decimal("456.78")], - "[CAST('123.75' AS NUMERIC), CAST('456.78' AS NUMERIC)]", + re.escape("[CAST('123.75' AS NUMERIC), CAST('456.78' AS NUMERIC)]"), ), # TODO: support BIGNUMERIC by looking at precision/scale of the DECIMAL - ([123.75, 456.78], "[123.75, 456.78]"), + ([123.75, 456.78], re.escape("[123.75, 456.78]")), # TODO: support RANGE type - (["abc", "def"], "['abc', 'def']"), + (["abc", "def"], re.escape("['abc', 'def']")), # TODO: support STRUCT type (possibly another method?) 
( [datetime.time(12, 34, 56, 789123), datetime.time(11, 25, 56, 789123)], - "[TIME(DATETIME('1970-01-01 12:34:56.789123')), TIME(DATETIME('1970-01-01 11:25:56.789123'))]", + re.escape( + "[TIME(DATETIME('1970-01-01 12:34:56.789123')), TIME(DATETIME('1970-01-01 11:25:56.789123'))]" + ), ), ( [ @@ -121,13 +123,15 @@ def test_simple_literal(value, expected_pattern): 2025, 2, 1, 4, 45, 6, 789123, tzinfo=datetime.timezone.utc ), ], - "[TIMESTAMP('2025-01-02T03:45:06.789123+00:00'), TIMESTAMP('2025-02-01T04:45:06.789123+00:00')]", + re.escape( + "[TIMESTAMP('2025-01-02T03:45:06.789123+00:00'), TIMESTAMP('2025-02-01T04:45:06.789123+00:00')]" + ), ), ), ) -def test_simple_literal_w_list(value: list, expected: str): +def test_simple_literal_w_list(value: list, expected_pattern: str): got = sql.simple_literal(value) - assert got == expected + assert re.match(expected_pattern, got) is not None def test_create_vector_search_sql_simple(): From e84f232b0fc5e2167a7cddb355cf0c8837ae5422 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 23 Apr 2025 16:52:15 -0700 Subject: [PATCH 08/28] feat: Short circuit query for local scan (#1618) --- bigframes/core/local_data.py | 103 ++++++++++++++--------- bigframes/core/rewrite/__init__.py | 6 +- bigframes/core/rewrite/scan_reduction.py | 19 ++++- bigframes/dtypes.py | 15 +++- bigframes/session/bq_caching_executor.py | 3 +- bigframes/session/local_scan_executor.py | 68 +++++++++++++++ tests/system/small/bigquery/test_json.py | 2 +- tests/system/small/test_session.py | 9 +- tests/unit/test_local_engine.py | 2 +- 9 files changed, 176 insertions(+), 51 deletions(-) create mode 100644 bigframes/session/local_scan_executor.py diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index e5c67fcf43..1330d04589 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -31,7 +31,6 @@ import pyarrow.parquet # type: ignore import bigframes.core.schema as schemata -import bigframes.core.utils as utils 
import bigframes.dtypes @@ -79,7 +78,7 @@ def from_pandas(cls, dataframe: pd.DataFrame) -> ManagedArrowTable: mat = ManagedArrowTable( pa.table(columns, names=column_names), schemata.ArraySchema(tuple(fields)) ) - mat.validate(include_content=True) + mat.validate() return mat @classmethod @@ -98,15 +97,14 @@ def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable: mat.validate() return mat - def to_parquet( + def to_pyarrow_table( self, - dst: Union[str, io.IOBase], *, offsets_col: Optional[str] = None, geo_format: Literal["wkb", "wkt"] = "wkt", duration_type: Literal["int", "duration"] = "duration", json_type: Literal["string"] = "string", - ): + ) -> pa.Table: pa_table = self.data if offsets_col is not None: pa_table = pa_table.append_column( @@ -119,6 +117,23 @@ def to_parquet( f"duration as {duration_type} not yet implemented" ) assert json_type == "string" + return pa_table + + def to_parquet( + self, + dst: Union[str, io.IOBase], + *, + offsets_col: Optional[str] = None, + geo_format: Literal["wkb", "wkt"] = "wkt", + duration_type: Literal["int", "duration"] = "duration", + json_type: Literal["string"] = "string", + ): + pa_table = self.to_pyarrow_table( + offsets_col=offsets_col, + geo_format=geo_format, + duration_type=duration_type, + json_type=json_type, + ) pyarrow.parquet.write_table(pa_table, where=dst) def itertuples( @@ -142,7 +157,7 @@ def itertuples( ): yield tuple(row_dict.values()) - def validate(self, include_content: bool = False): + def validate(self): for bf_field, arrow_field in zip(self.schema.items, self.data.schema): expected_arrow_type = _get_managed_storage_type(bf_field.dtype) arrow_type = arrow_field.type @@ -151,38 +166,6 @@ def validate(self, include_content: bool = False): f"Field {bf_field} has arrow array type: {arrow_type}, expected type: {expected_arrow_type}" ) - if include_content: - for batch in self.data.to_batches(): - for field in self.schema.items: - _validate_content(batch.column(field.column), field.dtype) - - 
-def _validate_content(array: pa.Array, dtype: bigframes.dtypes.Dtype): - """ - Recursively validates the content of a PyArrow Array based on the - expected BigFrames dtype, focusing on complex types like JSON, structs, - and arrays where the Arrow type alone isn't sufficient. - """ - # TODO: validate GEO data context. - if dtype == bigframes.dtypes.JSON_DTYPE: - values = array.to_pandas() - for data in values: - # Skip scalar null values to avoid `TypeError` from json.load. - if not utils.is_list_like(data) and pd.isna(data): - continue - try: - # Attempts JSON parsing. - json.loads(data) - except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON format found: {data!r}") from e - elif bigframes.dtypes.is_struct_like(dtype): - for field_name, dtype in bigframes.dtypes.get_struct_fields(dtype).items(): - _validate_content(array.field(field_name), dtype) - elif bigframes.dtypes.is_array_like(dtype): - return _validate_content( - array.flatten(), bigframes.dtypes.get_array_inner_type(dtype) - ) - # Sequential iterator, but could split into batches and leverage parallelism for speed def _iter_table( @@ -280,6 +263,34 @@ def _adapt_pandas_series( def _adapt_arrow_array( array: Union[pa.ChunkedArray, pa.Array] ) -> tuple[Union[pa.ChunkedArray, pa.Array], bigframes.dtypes.Dtype]: + """Normalize the array to managed storage types. 
Preverse shapes, only transforms values.""" + if pa.types.is_struct(array.type): + assert isinstance(array, pa.StructArray) + assert isinstance(array.type, pa.StructType) + arrays = [] + dtypes = [] + pa_fields = [] + for i in range(array.type.num_fields): + field_array, field_type = _adapt_arrow_array(array.field(i)) + arrays.append(field_array) + dtypes.append(field_type) + pa_fields.append(pa.field(array.type.field(i).name, field_array.type)) + struct_array = pa.StructArray.from_arrays( + arrays=arrays, fields=pa_fields, mask=array.is_null() + ) + dtype = bigframes.dtypes.struct_type( + [(field.name, dtype) for field, dtype in zip(pa_fields, dtypes)] + ) + return struct_array, dtype + if pa.types.is_list(array.type): + assert isinstance(array, pa.ListArray) + values, values_type = _adapt_arrow_array(array.values) + new_value = pa.ListArray.from_arrays( + array.offsets, values, mask=array.is_null() + ) + return new_value.fill_null([]), bigframes.dtypes.list_type(values_type) + if array.type == bigframes.dtypes.JSON_ARROW_TYPE: + return _canonicalize_json(array), bigframes.dtypes.JSON_DTYPE target_type = _logical_type_replacements(array.type) if target_type != array.type: # TODO: Maybe warn if lossy conversion? @@ -292,6 +303,22 @@ def _adapt_arrow_array( return array, bf_type +def _canonicalize_json(array: pa.Array) -> pa.Array: + def _canonicalize_scalar(json_string): + if json_string is None: + return None + # This is the canonical form that bq uses when emitting json + # The sorted keys and unambiguous whitespace ensures a 1:1 mapping + # between syntax and semantics. 
+ return json.dumps( + json.loads(json_string), sort_keys=True, separators=(",", ":") + ) + + return pa.array( + [_canonicalize_scalar(value) for value in array.to_pylist()], type=pa.string() + ) + + def _get_managed_storage_type(dtype: bigframes.dtypes.Dtype) -> pa.DataType: if dtype in _MANAGED_STORAGE_TYPES_OVERRIDES.keys(): return _MANAGED_STORAGE_TYPES_OVERRIDES[dtype] diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 555199dcd9..e9a94069f5 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -17,7 +17,10 @@ from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.order import bake_order, defer_order from bigframes.core.rewrite.pruning import column_pruning -from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan +from bigframes.core.rewrite.scan_reduction import ( + try_reduce_to_local_scan, + try_reduce_to_table_scan, +) from bigframes.core.rewrite.slices import pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions from bigframes.core.rewrite.windows import rewrite_range_rolling @@ -34,4 +37,5 @@ "rewrite_range_rolling", "try_reduce_to_table_scan", "bake_order", + "try_reduce_to_local_scan", ] diff --git a/bigframes/core/rewrite/scan_reduction.py b/bigframes/core/rewrite/scan_reduction.py index be8db4827c..b9050c0c34 100644 --- a/bigframes/core/rewrite/scan_reduction.py +++ b/bigframes/core/rewrite/scan_reduction.py @@ -28,6 +28,20 @@ def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTab return None +def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]: + if not all( + map( + lambda x: isinstance(x, (nodes.ReadLocalNode, nodes.SelectionNode)), + node.unique_nodes(), + ) + ): + return None + result = node.bottom_up(merge_scan) + if isinstance(result, nodes.ReadLocalNode): + return result + return None + + 
@functools.singledispatch def merge_scan(node: nodes.BigFrameNode) -> nodes.BigFrameNode: return node @@ -35,11 +49,12 @@ def merge_scan(node: nodes.BigFrameNode) -> nodes.BigFrameNode: @merge_scan.register def _(node: nodes.SelectionNode) -> nodes.BigFrameNode: - if not isinstance(node.child, nodes.ReadTableNode): + if not isinstance(node.child, (nodes.ReadTableNode, nodes.ReadLocalNode)): return node if node.has_multi_referenced_ids: return node - + if isinstance(node, nodes.ReadLocalNode) and node.offsets_col is not None: + return node selection = { aliased_ref.ref.id: aliased_ref.id for aliased_ref in node.input_output_pairs } diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index eff9d02c5c..ec115a93d0 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -19,7 +19,7 @@ import decimal import textwrap import typing -from typing import Any, Dict, List, Literal, Union +from typing import Any, Dict, List, Literal, Sequence, Union import bigframes_vendored.constants as constants import db_dtypes # type: ignore @@ -370,6 +370,19 @@ def get_array_inner_type(type_: ExpressionType) -> Dtype: return arrow_dtype_to_bigframes_dtype(list_type.value_type) +def list_type(values_type: Dtype) -> Dtype: + """Create a list dtype with given value type.""" + return pd.ArrowDtype(pa.list_(bigframes_dtype_to_arrow_dtype(values_type))) + + +def struct_type(fields: Sequence[tuple[str, Dtype]]) -> Dtype: + """Create a struct dtype with give fields names and types.""" + pa_fields = [ + pa.field(str, bigframes_dtype_to_arrow_dtype(dtype)) for str, dtype in fields + ] + return pd.ArrowDtype(pa.struct(pa_fields)) + + _ORDERABLE_SIMPLE_TYPES = set( mapping.dtype for mapping in SIMPLE_TYPES if mapping.orderable ) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 983b1918f5..584b41452a 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -35,7 +35,7 @@ import bigframes.dtypes 
import bigframes.exceptions as bfe import bigframes.features -from bigframes.session import executor, read_api_execution +from bigframes.session import executor, local_scan_executor, read_api_execution import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -84,6 +84,7 @@ def __init__( bqstoragereadclient=bqstoragereadclient, project=self.bqclient.project, ), + local_scan_executor.LocalScanExecutor(), ) def to_sql( diff --git a/bigframes/session/local_scan_executor.py b/bigframes/session/local_scan_executor.py new file mode 100644 index 0000000000..3ed1b759c5 --- /dev/null +++ b/bigframes/session/local_scan_executor.py @@ -0,0 +1,68 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Optional + +from bigframes.core import bigframe_node, rewrite +from bigframes.session import executor, semi_executor + + +class LocalScanExecutor(semi_executor.SemiExecutor): + """ + Executes plans reducible to a arrow table scan. 
+ """ + + def execute( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + peek: Optional[int] = None, + ) -> Optional[executor.ExecuteResult]: + node = rewrite.try_reduce_to_local_scan(plan) + if not node: + return None + + # TODO: Can support some slicing, sorting + def iterator_supplier(): + offsets_col = ( + node.offsets_col.sql if (node.offsets_col is not None) else None + ) + arrow_table = node.local_data_source.to_pyarrow_table( + offsets_col=offsets_col + ) + if peek: + arrow_table = arrow_table.slice(0, peek) + + needed_cols = [item.source_id for item in node.scan_list.items] + if offsets_col is not None: + needed_cols.append(offsets_col) + + arrow_table = arrow_table.select(needed_cols) + arrow_table = arrow_table.rename_columns( + {item.source_id: item.id.sql for item in node.scan_list.items} + ) + yield from arrow_table.to_batches() + + total_rows = node.row_count + if (peek is not None) and (total_rows is not None): + total_rows = min(peek, total_rows) + + return executor.ExecuteResult( + arrow_batches=iterator_supplier, + schema=plan.schema, + query_job=None, + total_bytes=None, + total_rows=total_rows, + ) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 00f690ed54..df5a524b55 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -66,7 +66,7 @@ def test_json_set_w_more_pairs(): s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] ) - expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] + expected_json = ['{"a": 3,"b":2}', '{"a":4,"b": 2}', '{"a": 5,"b":2,"c":1}'] expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index c7e7fa3573..668c764645 100644 --- a/tests/system/small/test_session.py +++ 
b/tests/system/small/test_session.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import io +import json import random import re import tempfile @@ -32,7 +33,6 @@ import pytest import bigframes -import bigframes.core.indexes.base import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model @@ -990,10 +990,7 @@ def test_read_pandas_json_series_w_invalid_json(session, write_engine): ] pd_s = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) - with pytest.raises( - ValueError, - match="Invalid JSON format found", - ): + with pytest.raises(json.JSONDecodeError): session.read_pandas(pd_s, write_engine=write_engine) @@ -1101,7 +1098,7 @@ def test_read_pandas_w_nested_invalid_json(session, write_engine): ), ) - with pytest.raises(ValueError, match="Invalid JSON format found"): + with pytest.raises(json.JSONDecodeError): session.read_pandas(pd_s, write_engine=write_engine) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index d4e0dae1f3..b4672d07a9 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -41,7 +41,7 @@ def small_inline_frame() -> pd.DataFrame: "bools": pd.Series([True, None, False], dtype="boolean"), "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"), "intLists": pd.Series( - [[1, 2, 3], [4, 5, 6, 7], None], + [[1, 2, 3], [4, 5, 6, 7], []], dtype=pd.ArrowDtype(pa.list_(pa.int64())), ), }, From 9662745265c8c6e42f372629bd2c7806542cee1a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 24 Apr 2025 12:14:44 -0700 Subject: [PATCH 09/28] fix: Remove itertools.pairwise usage (#1638) --- bigframes/core/local_data.py | 15 +++++++++++- tests/unit/test_local_data.py | 46 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_local_data.py diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 
1330d04589..baa29ba72b 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -212,7 +212,7 @@ def _( value_generator = iter_array( array.flatten(), bigframes.dtypes.get_array_inner_type(dtype) ) - for (start, end) in itertools.pairwise(array.offsets): + for (start, end) in _pairwise(array.offsets): arr_size = end.as_py() - start.as_py() yield list(itertools.islice(value_generator, arr_size)) @@ -389,3 +389,16 @@ def _physical_type_replacements(dtype: pa.DataType) -> pa.DataType: if dtype in _ARROW_MANAGED_STORAGE_OVERRIDES: return _ARROW_MANAGED_STORAGE_OVERRIDES[dtype] return dtype + + +def _pairwise(iterable): + do_yield = False + a = None + b = None + for item in iterable: + a = b + b = item + if do_yield: + yield (a, b) + else: + do_yield = True diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py new file mode 100644 index 0000000000..9cd08787c9 --- /dev/null +++ b/tests/unit/test_local_data.py @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pandas as pd +import pandas.testing +import pyarrow as pa + +from bigframes import dtypes +from bigframes.core import local_data + +pd_data = pd.DataFrame( + { + "ints": [10, 20, 30, 40], + "nested_ints": [[1, 2], [3, 4, 5], [], [20, 30]], + "structs": [{"a": 100}, {}, {"b": 200}, {"b": 300}], + } +) + +pd_data_normalized = pd.DataFrame( + { + "ints": pd.Series([10, 20, 30, 40], dtype=dtypes.INT_DTYPE), + "nested_ints": pd.Series( + [[1, 2], [3, 4, 5], [], [20, 30]], dtype=pd.ArrowDtype(pa.list_(pa.int64())) + ), + "structs": pd.Series( + [{"a": 100}, {}, {"b": 200}, {"b": 300}], + dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.int64()})), + ), + } +) + + +def test_local_data_well_formed_round_trip(): + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + result = pd.DataFrame(local_entry.itertuples(), columns=pd_data.columns) + pandas.testing.assert_frame_equal(pd_data_normalized, result, check_dtype=False) From 9128c4a31dab487bc23f67c43380abd0beda5b1c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 24 Apr 2025 13:32:51 -0700 Subject: [PATCH 10/28] docs: add JSON data types notebook (#1647) --- .../array.ipynb} | 461 +++++------------ notebooks/data_types/json.ipynb | 451 ++++++++++++++++ notebooks/data_types/struct.ipynb | 483 ++++++++++++++++++ 3 files changed, 1064 insertions(+), 331 deletions(-) rename notebooks/{dataframes/struct_and_array_dtypes.ipynb => data_types/array.ipynb} (50%) create mode 100644 notebooks/data_types/json.ipynb create mode 100644 notebooks/data_types/struct.ipynb diff --git a/notebooks/dataframes/struct_and_array_dtypes.ipynb b/notebooks/data_types/array.ipynb similarity index 50% rename from notebooks/dataframes/struct_and_array_dtypes.ipynb rename to notebooks/data_types/array.ipynb index def65ee6ca..96c5da5ac6 100644 --- a/notebooks/dataframes/struct_and_array_dtypes.ipynb +++ b/notebooks/data_types/array.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Copyright 2024 Google 
LLC\n", + "# Copyright 2025 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -25,16 +25,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# A Guide to Array and Struct Data Types in BigQuery DataFrames" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Set up your environment\n", + "# Array Data Types\n", + "\n", + "In BigQuery, an [ARRAY](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#array_type) (also called a `repeated` column) is an ordered list of zero or more elements of the same, non-`NULL` data type. It's important to note that BigQuery `ARRAY`s cannot contain nested `ARRAY`s. BigQuery DataFrames represents BigQuery `ARRAY` types to `pandas.ArrowDtype(pa.list_(T))`, where `T` is the underlying Arrow type of the array elements.\n", "\n", - "To get started, follow the instructions in the notebooks within the `getting_started` folder to set up your environment. Once your environment is ready, you can import the necessary packages by running the following code:" + "This notebook illustrates how to work with `ARRAY` columns in BigQuery DataFrames. First, let's import the required packages and perform the necessary setup below." ] }, { @@ -45,6 +40,7 @@ "source": [ "import bigframes.pandas as bpd\n", "import bigframes.bigquery as bbq\n", + "import pandas as pd\n", "import pyarrow as pa" ] }, @@ -64,20 +60,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Array Data Types\n", - "\n", - "In BigQuery, an [array](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#array_type) (also called a repeated column) is an ordered list of zero or more elements of the same data type. 
Arrays cannot contain other arrays or `NULL` elements.\n", + "## Create DataFrames with an array column\n", "\n", - "BigQuery DataFrames map BigQuery array types to `pandas.ArrowDtype(pa.list_())`. The following code examples illustrate how to work with array columns in BigQuery DataFrames." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create DataFrames with array columns\n", - "\n", - "Create a DataFrame in BigQuery DataFrames from local sample data. Use a list of lists to create a column with the `list[pyarrow]` dtype, which corresponds to the `ARRAY` type in BigQuery." + "**Example 1: Creating from a list of lists/tuples**" ] }, { @@ -146,10 +131,13 @@ } ], "source": [ - "df = bpd.DataFrame({\n", - " 'Name': ['Alice', 'Bob', 'Charlie'],\n", - " 'Scores': [[95, 88, 92], [78, 81], [82, 89, 94, 100]],\n", - "})\n", + "names = [\"Alice\", \"Bob\", \"Charlie\"]\n", + "scores = [\n", + " [95, 88, 92],\n", + " [78, 81],\n", + " [82, 89, 94, 100]\n", + "]\n", + "df = bpd.DataFrame({\"Name\": names, \"Scores\": scores})\n", "df" ] }, @@ -179,9 +167,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Operate on array data\n", - "\n", - "While pandas offers vectorized operations and lambda expressions for array manipulation, BigQuery DataFrames leverages the computational power of BigQuery itself. You can access a variety of native BigQuery array operations, such as [`array_agg`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_array_agg) and [`array_length`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_array_length), through the [`bigframes.bigquery`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery) package (abbreviated as `bbq` in the following code samples)." 
+ "**Example 2: Defining schema explicitly**" ] }, { @@ -192,10 +178,10 @@ { "data": { "text/plain": [ - "0 3\n", - "1 2\n", - "2 4\n", - "Name: Scores, dtype: Int64" + "0 [95. 88. 92.]\n", + "1 [78. 81.]\n", + "2 [ 82. 89. 94. 100.]\n", + "dtype: list[pyarrow]" ] }, "execution_count": 6, @@ -204,8 +190,14 @@ } ], "source": [ - "# Find the length in each array.\n", - "bbq.array_length(df['Scores'])" + "bpd.Series(data=scores, dtype=pd.ArrowDtype(pa.list_(pa.float64())))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 3: Reading from a source**" ] }, { @@ -216,10 +208,12 @@ { "data": { "text/plain": [ - "0 3\n", - "1 2\n", - "2 4\n", - "Name: Scores, dtype: Int64" + "0 [{'tables': {'score': 0.9349926710128784, 'val...\n", + "1 [{'tables': {'score': 0.9690881371498108, 'val...\n", + "2 [{'tables': {'score': 0.8667634129524231, 'val...\n", + "3 [{'tables': {'score': 0.9351968765258789, 'val...\n", + "4 [{'tables': {'score': 0.8572560548782349, 'val...\n", + "Name: predicted_default_payment_next_month, dtype: list>>[pyarrow]" ] }, "execution_count": 7, @@ -228,389 +222,188 @@ } ], "source": [ - "# Find the length of each array with list accessor\n", - "df['Scores'].list.len()" + "bpd.read_gbq(\"bigquery-public-data.ml_datasets.credit_card_default\", max_results=5)[\"predicted_default_payment_next_month\"]" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 88\n", - "1 81\n", - "2 89\n", - "Name: Scores, dtype: Int64" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# Find the second element in each array with list accessor\n", - "df['Scores'].list[1]" + "## Operate on `ARRAY` data\n", + "\n", + "BigQuery DataFrames provides two main approaches for operating on list (`ARRAY`) data:\n", + "\n", + "1. 
**The `Series.list` accessor**: Provides Pandas-like methods for array column manipulation.\n", + "2. **[BigQuery built-in functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions)**: Allows you to use functions mirroring BigQuery SQL operations, available through the `bigframes.bigquery` module (abbreviated as `bbq` below), such as [`array_agg`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_array_agg) and [`array_length`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_array_length)." ] }, { - "cell_type": "code", - "execution_count": 9, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 95\n", - "0 88\n", - "0 92\n", - "1 78\n", - "1 81\n", - "2 82\n", - "2 89\n", - "2 94\n", - "2 100\n", - "Name: Scores, dtype: Int64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# Transforms array elements into individual rows, preserving original order when in ordering\n", - "# mode. 
If an array has multiple elements, exploded rows are ordered by the element's index\n", - "# within its original array.\n", - "scores = df['Scores'].explode()\n", - "scores" + "### Get the Length of Each Arrray\n", + "\n", + "**Example 1: Using list accessor to get array length**" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 100.0\n", - "0 93.0\n", - "0 97.0\n", - "1 83.0\n", - "1 86.0\n", - "2 87.0\n", - "2 94.0\n", - "2 99.0\n", - "2 105.0\n", - "Name: Scores, dtype: Float64" + "0 3\n", + "1 2\n", + "2 4\n", + "Name: Scores, dtype: Int64" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Adjust the scores.\n", - "adj_scores = scores + 5.0\n", - "adj_scores" + "df['Scores'].list.len()" ] }, { - "cell_type": "code", - "execution_count": 11, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 [100. 93. 97.]\n", - "1 [83. 86.]\n", - "2 [ 87. 94. 99. 105.]\n", - "Name: Scores, dtype: list[pyarrow]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# Aggregate adjusted scores back into arrays.\n", - "adj_scores_arr = bbq.array_agg(adj_scores.groupby(level=0))\n", - "adj_scores_arr" + "**Example 2: Using BigQuery build-in functions to get array length**" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameScoresNewScores
0Alice[95 88 92][100. 93. 97.]
1Bob[78 81][83. 86.]
2Charlie[ 82 89 94 100][ 87. 94. 99. 105.]
\n", - "

3 rows × 3 columns

\n", - "
[3 rows x 3 columns in total]" - ], "text/plain": [ - " Name Scores NewScores\n", - "0 Alice [95 88 92] [100. 93. 97.]\n", - "1 Bob [78 81] [83. 86.]\n", - "2 Charlie [ 82 89 94 100] [ 87. 94. 99. 105.]\n", - "\n", - "[3 rows x 3 columns]" + "0 3\n", + "1 2\n", + "2 4\n", + "Name: Scores, dtype: Int64" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Add adjusted scores into the DataFrame. This operation requires an implicit join \n", - "# between the two tables, necessitating a unique index in the DataFrame (guaranteed \n", - "# in the default ordering and index mode).\n", - "df['NewScores'] = adj_scores_arr\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Struct Data Types\n", - "\n", - "In BigQuery, a [struct](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type) (also known as a `record`) is a collection of ordered fields, each with a defined data type (required) and an optional field name. BigQuery DataFrames maps BigQuery struct types to the pandas equivalent, `pandas.ArrowDtype(pa.struct())`. This section provides practical code examples illustrating how to use struct columns with BigQuery DataFrames." + "bbq.array_length(df['Scores'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Create DataFrames with struct columns \n", - "\n", - "Create a DataFrame with an `Address` struct column by using dictionaries for the data and setting the dtype to `struct[pyarrow]`." 
+ "### Access Element at a Specific Index (e.g., First Element) " ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/google/cloud/bigquery/_pandas_helpers.py:570: UserWarning: Pyarrow could not determine the type of columns: bigframes_unnamed_index.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameAddress
0Alice{'City': 'New York', 'State': 'NY'}
1Bob{'City': 'San Francisco', 'State': 'CA'}
2Charlie{'City': 'Seattle', 'State': 'WA'}
\n", - "

3 rows × 2 columns

\n", - "
[3 rows x 2 columns in total]" - ], - "text/plain": [ - " Name Address\n", - "0 Alice {'City': 'New York', 'State': 'NY'}\n", - "1 Bob {'City': 'San Francisco', 'State': 'CA'}\n", - "2 Charlie {'City': 'Seattle', 'State': 'WA'}\n", - "\n", - "[3 rows x 2 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "names = bpd.Series(['Alice', 'Bob', 'Charlie'])\n", - "address = bpd.Series(\n", - " [\n", - " {'City': 'New York', 'State': 'NY'},\n", - " {'City': 'San Francisco', 'State': 'CA'},\n", - " {'City': 'Seattle', 'State': 'WA'}\n", - " ],\n", - " dtype=bpd.ArrowDtype(pa.struct(\n", - " [('City', pa.string()), ('State', pa.string())]\n", - " )))\n", - "\n", - "df = bpd.DataFrame({'Name': names, 'Address': address})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Name string[pyarrow]\n", - "Address struct[pyarrow]\n", - "dtype: object" + "0 95\n", + "1 78\n", + "2 82\n", + "Name: Scores, dtype: Int64" ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.dtypes" + "df['Scores'].list[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Operate on struct data\n", + "### Explode/Unnest Array elements into Seperate Rows\n", "\n", - "Similar to pandas, BigQuery DataFrames provides a [`StructAccessor`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor). Use the methods provided in this accessor to manipulate struct data." + "The exploded rows preserving original order when in ordering mode. If an array has multiple elements, exploded rows are ordered by the element's index\n", + "within its original array. 
" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "City string[pyarrow]\n", - "State string[pyarrow]\n", - "dtype: object" + "0 95\n", + "0 88\n", + "0 92\n", + "1 78\n", + "1 81\n", + "2 82\n", + "2 89\n", + "2 94\n", + "2 100\n", + "Name: Scores, dtype: Int64" ] }, - "execution_count": 15, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Return the dtype object of each child field of the struct.\n", - "df['Address'].struct.dtypes()" + "scores = df['Scores'].explode()\n", + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregate elements back into an array" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 New York\n", - "1 San Francisco\n", - "2 Seattle\n", - "Name: City, dtype: string" + "0 [100. 93. 97.]\n", + "1 [83. 86.]\n", + "2 [ 87. 94. 99. 105.]\n", + "Name: Scores, dtype: list[pyarrow]" ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Extract a child field as a Series\n", - "city = df['Address'].struct.field(\"City\")\n", - "city" + "new_scores = scores + 5.0\n", + "new_scores_arr = bbq.array_agg(new_scores.groupby(level=0))\n", + "new_scores_arr" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -634,49 +427,55 @@ " \n", " \n", " \n", - " City\n", - " State\n", + " Name\n", + " Scores\n", + " NewScores\n", " \n", " \n", " \n", " \n", " 0\n", - " New York\n", - " NY\n", + " Alice\n", + " [95 88 92]\n", + " [100. 93. 97.]\n", " \n", " \n", " 1\n", - " San Francisco\n", - " CA\n", + " Bob\n", + " [78 81]\n", + " [83. 86.]\n", " \n", " \n", " 2\n", - " Seattle\n", - " WA\n", + " Charlie\n", + " [ 82 89 94 100]\n", + " [ 87. 94. 99. 
105.]\n", " \n", " \n", "\n", - "

3 rows × 2 columns

\n", - "[3 rows x 2 columns in total]" + "

3 rows × 3 columns

\n", + "[3 rows x 3 columns in total]" ], "text/plain": [ - " City State\n", - "0 New York NY\n", - "1 San Francisco CA\n", - "2 Seattle WA\n", + " Name Scores NewScores\n", + "0 Alice [95 88 92] [100. 93. 97.]\n", + "1 Bob [78 81] [83. 86.]\n", + "2 Charlie [ 82 89 94 100] [ 87. 94. 99. 105.]\n", "\n", - "[3 rows x 2 columns]" + "[3 rows x 3 columns]" ] }, - "execution_count": 17, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Extract all child fields of a struct as a DataFrame.\n", - "address_df = df['Address'].struct.explode()\n", - "address_df" + "# Add adjusted scores into the DataFrame. This operation requires an implicit join \n", + "# between the two tables, necessitating a unique index in the DataFrame (guaranteed \n", + "# in the default ordering and index mode).\n", + "df['NewScores'] = new_scores_arr\n", + "df" ] } ], @@ -696,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/data_types/json.ipynb b/notebooks/data_types/json.ipynb new file mode 100644 index 0000000000..f0a8ed4ffe --- /dev/null +++ b/notebooks/data_types/json.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# 
limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JSON Data Types\n", + "\n", + "When using BigQuery DataFrames, columns containing data in BigQuery's [JSON](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#json_type) format (a lightweight standard) are represented as `pandas.ArrowDtype`. The exact underlying Arrow type depends on your library versions. Older environments typically use `db_dtypes.JSONArrowType()` for compatibility, which is an Arrow extension type acting as a light wrapper around `pa.string()`. In contrast, newer setups (pandas 3.0+ and pyarrow 19.0+) utilize the more recent `pa.json_(pa.string())` representation." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "import db_dtypes\n", + "import pandas as pd\n", + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}\n", + "\n", + "bpd.options.display.progress_bar = None\n", + "bpd.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Series with JSON columns\n", + "\n", + "**Example 1: Create a Series with a JSON dtype from local data**\n", + "\n", + "This example demonstrates creating a JSON Series from a list of JSON strings. Note that BigQuery standardizes these strings, for instance, by removing extra spaces and ordering dictionary keys. Specifying the `dtype` is essential; if omitted, a string-type Series will be generated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 \"str\"\n", + "2 false\n", + "3 [\"a\",{\"b\":1},null]\n", + "4 {\"a\":{\"b\":[1,2,3],\"c\":true}}\n", + "5 \n", + "dtype: extension>[pyarrow]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json_data = [\n", + " \"1\",\n", + " '\"str\"',\n", + " \"false\",\n", + " '[\"a\", {\"b\": 1}, null]',\n", + " '{\"a\": {\"b\": [1, 2, 3], \"c\": true}}',\n", + " None,\n", + "]\n", + "bpd.Series(json_data, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 2: Create a Series with a Nested JSON dtype from local data**\n", + "\n", + "To create a BigQuery DataFrame Series containing `JSON` data nested within a `STRUCT` or `LIST` type, you must represent the `JSON` data in a `pa.array` defined with the `pa.string` type. This workaround is necessary because Pyarrow lacks support for creating structs or lists that directly contain extension types (see [issue](https://github.com/apache/arrow/issues/45262))." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'key': '1'}]\n", + "1 [{'key': None}]\n", + "2 [{'key': '[\"1\",\"3\",\"5\"]'}]\n", + "3 [{'key': '{\"a\":1,\"b\":[\"x\",\"y\"],\"c\":{\"x\":[],\"z\"...\n", + "dtype: list>>>[pyarrow]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list_data = [\n", + " [{\"key\": \"1\"}],\n", + " [{\"key\": None}],\n", + " [{\"key\": '[\"1\",\"3\",\"5\"]'}],\n", + " [{\"key\": '{\"a\":1,\"b\":[\"x\",\"y\"],\"c\":{\"x\":[],\"z\":false}}'}],\n", + "]\n", + "pa_array = pa.array(list_data, type=pa.list_(pa.struct([(\"key\", pa.string())])))\n", + "bpd.Series(\n", + " pd.arrays.ArrowExtensionArray(pa_array),\n", + " dtype=pd.ArrowDtype(\n", + " pa.list_(pa.struct([(\"key\", db_dtypes.JSONArrowType())])),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 3: Create a Series with a Nested JSON dtype using BigQuery SQLs**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idstruct_col
01{'data': '{\"b\":100}', 'number': 2}
10{'data': '{\"a\":true}', 'number': 1}
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " id struct_col\n", + "0 1 {'data': '{\"b\":100}', 'number': 2}\n", + "1 0 {'data': '{\"a\":true}', 'number': 1}\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = \"\"\"\n", + "SELECT 0 AS id, STRUCT(JSON_OBJECT('a', True) AS data, 1 AS number) AS struct_col\n", + "UNION ALL\n", + "SELECT 1, STRUCT(JSON_OBJECT('b', 100), 2),\n", + "\"\"\"\n", + "df = bpd.read_gbq(sql)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id Int64\n", + "struct_col struct>,...\n", + "dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operate on `JSON` data\n", + "\n", + "The `bigframes.bigquery` module (often abbreviated as `bbq`) provides access within BigQuery DataFrames to various **[BigQuery built-in functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions)**. Examples relevant for JSON data include [`json_extract`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_json_extract) and [`parse_json`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_parse_json)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract JSON data via specific JSON path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 1: When JSON data is represented as strings**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "fruits = [\n", + " '{\"fruits\": [{\"name\": \"apple\"}, {\"name\": \"cherry\"}]}',\n", + " '{\"fruits\": [{\"name\": \"guava\"}, {\"name\": \"grapes\"}]}',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {\"fruits\": [{\"name\": \"apple\"}, {\"name\": \"cherr...\n", + "1 {\"fruits\": [{\"name\": \"guava\"}, {\"name\": \"grape...\n", + "dtype: string" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_s = bpd.Series(fruits, dtype=\"string\")\n", + "str_s" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {\"name\":\"apple\"}\n", + "1 {\"name\":\"guava\"}\n", + "dtype: string" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.json_extract(str_s, \"$.fruits[0]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 2: When JSON data is stored as JSON type**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {\"fruits\":[{\"name\":\"apple\"},{\"name\":\"cherry\"}]}\n", + "1 {\"fruits\":[{\"name\":\"guava\"},{\"name\":\"grapes\"}]}\n", + "dtype: extension>[pyarrow]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json_s = bpd.Series(fruits, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()))\n", + "json_s" + ] + }, + { + "cell_type": 
"code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {\"name\":\"apple\"}\n", + "1 {\"name\":\"guava\"}\n", + "dtype: extension>[pyarrow]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.json_extract(json_s, \"$.fruits[0]\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract an array from JSON data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 ['{\"name\":\"apple\"}' '{\"name\":\"cherry\"}']\n", + "1 ['{\"name\":\"guava\"}' '{\"name\":\"grapes\"}']\n", + "dtype: list>>[pyarrow]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.json_extract_array(json_s, \"$.fruits\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 ['{\"name\":\"apple\"}' '{\"name\":\"cherry\"}']\n", + "1 ['{\"name\":\"guava\"}' '{\"name\":\"grapes\"}']\n", + "dtype: list[pyarrow]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.json_extract_array(str_s, \"$.fruits\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/data_types/struct.ipynb b/notebooks/data_types/struct.ipynb new file mode 100644 index 0000000000..74bf69d239 --- /dev/null +++ b/notebooks/data_types/struct.ipynb @@ -0,0 +1,483 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Struct Data Types\n", + "\n", + "In BigQuery, a [STRUCT](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type) (also known as a `record`) is a collection of ordered fields, each with a defined data type (required) and an optional field name. BigQuery DataFrames maps BigQuery `STRUCT` types to the pandas equivalent, `pandas.ArrowDtype(pa.struct())`. \n", + "\n", + "This notebook illustrates how to work with `STRUCT` columns in BigQuery DataFrames. First, let's import the required packages and perform the necessary setup below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "import pandas as pd\n", + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}\n", + "\n", + "bpd.options.display.progress_bar = None\n", + "bpd.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create DataFrames with struct columns\n", + "\n", + "**Example 1: Creating from a list of objects**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAddress
0Alice{'City': 'New York', 'State': 'NY'}
1Bob{'City': 'San Francisco', 'State': 'CA'}
2Charlie{'City': 'Seattle', 'State': 'WA'}
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " Name Address\n", + "0 Alice {'City': 'New York', 'State': 'NY'}\n", + "1 Bob {'City': 'San Francisco', 'State': 'CA'}\n", + "2 Charlie {'City': 'Seattle', 'State': 'WA'}\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names = [\"Alice\", \"Bob\", \"Charlie\"]\n", + "addresses = [\n", + " {'City': 'New York', 'State': 'NY'},\n", + " {'City': 'San Francisco', 'State': 'CA'},\n", + " {'City': 'Seattle', 'State': 'WA'}\n", + "]\n", + "df = bpd.DataFrame({'Name': names, 'Address': addresses})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Name string[pyarrow]\n", + "Address struct[pyarrow]\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 2: Defining schema explicitly**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {'City': 'New York', 'State': 'NY'}\n", + "1 {'City': 'San Francisco', 'State': 'CA'}\n", + "2 {'City': 'Seattle', 'State': 'WA'}\n", + "dtype: struct[pyarrow]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bpd.Series(\n", + " data=addresses, \n", + " dtype=bpd.ArrowDtype(pa.struct([('City', pa.string()), ('State', pa.string())]))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 3: Reading from a source**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'tables': {'score': 0.9349926710128784, 'val...\n", + "1 [{'tables': {'score': 0.9690881371498108, 
'val...\n", + "2 [{'tables': {'score': 0.8667634129524231, 'val...\n", + "3 [{'tables': {'score': 0.9351968765258789, 'val...\n", + "4 [{'tables': {'score': 0.8572560548782349, 'val...\n", + "Name: predicted_default_payment_next_month, dtype: list>>[pyarrow]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bpd.read_gbq(\"bigquery-public-data.ml_datasets.credit_card_default\", max_results=5)[\"predicted_default_payment_next_month\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operate on `STRUCT` data\n", + "\n", + "BigQuery DataFrames provides three main approaches for operating on `STRUCT` data:\n", + "\n", + "1. **[The `Series.struct` accessor](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor)**: Provides Pandas-like methods for STRUCT column manipulation.\n", + "2. **The `DataFrame.struct` accessor**: Provides Pandas-like methods for all child STRUCT columns manipulation.\n", + "3. **[BigQuery built-in functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions)**: Allows you to use functions mirroring BigQuery SQL operations, available through the `bigframes.bigquery` module (abbreviated as `bbq` below), such as [`struct`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_struct)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Data Types of Struct Fields" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "City string[pyarrow]\n", + "State string[pyarrow]\n", + "dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].struct.dtypes()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Access a Struct Field by Name" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 New York\n", + "1 San Francisco\n", + "2 Seattle\n", + "Name: City, dtype: string" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].struct.field(\"City\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract Struct Fields into a DataFrame\n", + "\n", + "**Example 1: Using Series `.struct` accessor**" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CityState
0New YorkNY
1San FranciscoCA
2SeattleWA
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " City State\n", + "0 New York NY\n", + "1 San Francisco CA\n", + "2 Seattle WA\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].struct.explode()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example 2: Using DataFrame `.struct` accessor while keeping other columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAddress.CityAddress.State
0AliceNew YorkNY
1BobSan FranciscoCA
2CharlieSeattleWA
\n", + "

3 rows × 3 columns

\n", + "
[3 rows x 3 columns in total]" + ], + "text/plain": [ + " Name Address.City Address.State\n", + "0 Alice New York NY\n", + "1 Bob San Francisco CA\n", + "2 Charlie Seattle WA\n", + "\n", + "[3 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.struct.explode(\"Address\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 8482bfc1d4caa91a35c4fbf0be420301d05ad544 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 24 Apr 2025 14:11:49 -0700 Subject: [PATCH 11/28] perf: Use flyweight for node fields (#1654) --- bigframes/core/bigframe_node.py | 5 +- bigframes/core/nodes.py | 80 ++++++++++++++---------- bigframes/core/schema.py | 7 ++- bigframes/core/sequences.py | 105 ++++++++++++++++++++++++++++++++ tests/unit/test_sequences.py | 55 +++++++++++++++++ 5 files changed, 214 insertions(+), 38 deletions(-) create mode 100644 bigframes/core/sequences.py create mode 100644 tests/unit/test_sequences.py diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 369e8f6329..5509adc0ea 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -20,10 +20,9 @@ import functools import itertools import typing -from typing import Callable, Dict, Generator, Iterable, Mapping, Set, Tuple +from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple from bigframes.core import identifiers -import bigframes.core.guid import bigframes.core.schema as schemata import bigframes.dtypes @@ -163,7 +162,7 @@ def roots(self) -> typing.Set[BigFrameNode]: # 
TODO: Store some local data lazily for select, aggregate nodes. @property @abc.abstractmethod - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: ... @property diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index d2c301b4ad..f7327f2a7a 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -33,7 +33,7 @@ import google.cloud.bigquery as bq -from bigframes.core import identifiers, local_data +from bigframes.core import identifiers, local_data, sequences from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field import bigframes.core.expression as ex from bigframes.core.ordering import OrderingExpression, RowOrdering @@ -87,7 +87,7 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.child,) @property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: return self.child.fields @property @@ -226,8 +226,8 @@ def added_fields(self) -> Tuple[Field, ...]: return (Field(self.indicator_col, bigframes.dtypes.BOOL_DTYPE, nullable=False),) @property - def fields(self) -> Iterable[Field]: - return itertools.chain( + def fields(self) -> Sequence[Field]: + return sequences.ChainedSequence( self.left_child.fields, self.added_fields, ) @@ -321,15 +321,15 @@ def order_ambiguous(self) -> bool: def explicitly_ordered(self) -> bool: return self.propogate_order - @property - def fields(self) -> Iterable[Field]: - left_fields = self.left_child.fields + @functools.cached_property + def fields(self) -> Sequence[Field]: + left_fields: Iterable[Field] = self.left_child.fields if self.type in ("right", "outer"): left_fields = map(lambda x: x.with_nullable(), left_fields) - right_fields = self.right_child.fields + right_fields: Iterable[Field] = self.right_child.fields if self.type in ("left", "outer"): right_fields = map(lambda x: x.with_nullable(), right_fields) - return itertools.chain(left_fields, right_fields) + return (*left_fields, *right_fields) @property def 
joins_nulls(self) -> bool: @@ -430,10 +430,10 @@ def explicitly_ordered(self) -> bool: return True @property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: # TODO: Output names should probably be aligned beforehand or be part of concat definition # TODO: Handle nullability - return ( + return tuple( Field(id, field.dtype) for id, field in zip(self.output_ids, self.children[0].fields) ) @@ -505,7 +505,7 @@ def explicitly_ordered(self) -> bool: return True @functools.cached_property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: return ( Field(self.output_id, next(iter(self.start.fields)).dtype, nullable=False), ) @@ -626,12 +626,20 @@ class ReadLocalNode(LeafNode): session: typing.Optional[bigframes.session.Session] = None @property - def fields(self) -> Iterable[Field]: - fields = (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + def fields(self) -> Sequence[Field]: + fields = tuple( + Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items + ) if self.offsets_col is not None: - return itertools.chain( - fields, - (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False),), + return tuple( + itertools.chain( + fields, + ( + Field( + self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False + ), + ), + ) ) return fields @@ -767,8 +775,8 @@ def session(self): return self.table_session @property - def fields(self) -> Iterable[Field]: - return ( + def fields(self) -> Sequence[Field]: + return tuple( Field(col_id, dtype, self.source.table.schema_by_id[source_id].is_nullable) for col_id, dtype, source_id in self.scan_list.items ) @@ -881,8 +889,8 @@ def non_local(self) -> bool: return True @property - def fields(self) -> Iterable[Field]: - return itertools.chain(self.child.fields, self.added_fields) + def fields(self) -> Sequence[Field]: + return sequences.ChainedSequence(self.child.fields, self.added_fields) @property def relation_ops_created(self) -> int: @@ 
-1097,7 +1105,7 @@ def _validate(self): raise ValueError(f"Reference to column not in child: {ref.id}") @functools.cached_property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: input_fields_by_id = {field.id: field for field in self.child.fields} return tuple( Field( @@ -1192,8 +1200,8 @@ def added_fields(self) -> Tuple[Field, ...]: return tuple(fields) @property - def fields(self) -> Iterable[Field]: - return itertools.chain(self.child.fields, self.added_fields) + def fields(self) -> Sequence[Field]: + return sequences.ChainedSequence(self.child.fields, self.added_fields) @property def variables_introduced(self) -> int: @@ -1263,7 +1271,7 @@ def non_local(self) -> bool: return True @property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: return (Field(self.col_id, bigframes.dtypes.INT_DTYPE, nullable=False),) @property @@ -1313,7 +1321,7 @@ def non_local(self) -> bool: return True @functools.cached_property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: # TODO: Use child nullability to infer grouping key nullability by_fields = (self.child.field_by_id[ref.id] for ref in self.by_column_ids) if self.dropna: @@ -1411,8 +1419,8 @@ def non_local(self) -> bool: return True @property - def fields(self) -> Iterable[Field]: - return itertools.chain(self.child.fields, [self.added_field]) + def fields(self) -> Sequence[Field]: + return sequences.ChainedSequence(self.child.fields, (self.added_field,)) @property def variables_introduced(self) -> int: @@ -1547,7 +1555,7 @@ def row_preserving(self) -> bool: return False @property - def fields(self) -> Iterable[Field]: + def fields(self) -> Sequence[Field]: fields = ( Field( field.id, @@ -1561,11 +1569,17 @@ def fields(self) -> Iterable[Field]: for field in self.child.fields ) if self.offsets_col is not None: - return itertools.chain( - fields, - (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False),), + return tuple( + 
itertools.chain( + fields, + ( + Field( + self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False + ), + ), + ) ) - return fields + return tuple(fields) @property def relation_ops_created(self) -> int: diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index c4cbb51ef9..4f636ab210 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -17,11 +17,11 @@ from dataclasses import dataclass import functools import typing +from typing import Sequence import google.cloud.bigquery import pyarrow -import bigframes.core.guid import bigframes.dtypes ColumnIdentifierType = str @@ -35,7 +35,10 @@ class SchemaItem: @dataclass(frozen=True) class ArraySchema: - items: typing.Tuple[SchemaItem, ...] + items: Sequence[SchemaItem] + + def __iter__(self): + yield from self.items @classmethod def from_bq_table( diff --git a/bigframes/core/sequences.py b/bigframes/core/sequences.py new file mode 100644 index 0000000000..6f1b7e455b --- /dev/null +++ b/bigframes/core/sequences.py @@ -0,0 +1,105 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import collections.abc +import functools +import itertools +from typing import Iterable, Iterator, Sequence, TypeVar + +ColumnIdentifierType = str + + +T = TypeVar("T") + +# Further optimizations possible: +# * Support mapping operators +# * Support insertions and deletions + + +class ChainedSequence(collections.abc.Sequence[T]): + """ + Memory-optimized sequence from composing chain of existing sequences. + + Will use the provided parts as underlying storage - so do not mutate provided parts. + May merge small underlying parts for better access performance. + """ + + def __init__(self, *parts: Sequence[T]): + # Could build an index that makes random access faster? + self._parts: tuple[Sequence[T], ...] = tuple( + _defrag_parts(_flatten_parts(parts)) + ) + + def __getitem__(self, index): + if isinstance(index, slice): + return tuple(self)[index] + if index < 0: + index = len(self) + index + if index < 0: + raise IndexError("Index out of bounds") + + offset = 0 + for part in self._parts: + if (index - offset) < len(part): + return part[index - offset] + offset += len(part) + raise IndexError("Index out of bounds") + + @functools.cache + def __len__(self): + return sum(map(len, self._parts)) + + def __iter__(self): + for part in self._parts: + yield from part + + +def _flatten_parts(parts: Iterable[Sequence[T]]) -> Iterator[Sequence[T]]: + for part in parts: + if isinstance(part, ChainedSequence): + yield from part._parts + else: + yield part + + +# Should be a cache-friendly chunk size? +_TARGET_SIZE = 128 +_MAX_MERGABLE = 32 + + +def _defrag_parts(parts: Iterable[Sequence[T]]) -> Iterator[Sequence[T]]: + """ + Merge small chunks into larger chunks for better performance. 
+ """ + parts_queue: list[Sequence[T]] = [] + queued_items = 0 + for part in parts: + # too big, just yield from the buffer + if len(part) > _MAX_MERGABLE: + yield from parts_queue + parts_queue = [] + queued_items = 0 + yield part + else: # can be merged, so lets add to the queue + parts_queue.append(part) + queued_items += len(part) + # if queue has reached target size, merge, dump and reset queue + if queued_items >= _TARGET_SIZE: + yield tuple(itertools.chain(*parts_queue)) + parts_queue = [] + queued_items = 0 + + yield from parts_queue diff --git a/tests/unit/test_sequences.py b/tests/unit/test_sequences.py new file mode 100644 index 0000000000..d901670b9b --- /dev/null +++ b/tests/unit/test_sequences.py @@ -0,0 +1,55 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import itertools +from typing import Sequence + +import pytest + +from bigframes.core import sequences + +LARGE_LIST = list(range(100, 500)) +SMALL_LIST = list(range(1, 5)) +CHAINED_LIST = sequences.ChainedSequence([SMALL_LIST for i in range(100)]) + + +def _build_reference(*parts): + return tuple(itertools.chain(*parts)) + + +def _check_equivalence(expected: Sequence, actual: Sequence): + assert len(expected) == len(actual) + assert tuple(expected) == tuple(actual) + assert expected[10:1:-2] == actual[10:1:-2] + if len(expected) > 0: + assert expected[len(expected) - 1] == expected[len(actual) - 1] + + +@pytest.mark.parametrize( + ("parts",), + [ + ([],), + ([[]],), + ([[0, 1, 2]],), + ([LARGE_LIST, SMALL_LIST, LARGE_LIST],), + ([SMALL_LIST * 100],), + ([CHAINED_LIST, LARGE_LIST, CHAINED_LIST, SMALL_LIST],), + ], +) +def test_init_chained_sequence_single_slist(parts): + value = sequences.ChainedSequence(*parts) + expected = _build_reference(*parts) + _check_equivalence(expected, value) From cd7fbde026522f53a23a4bb6585ad8629769fad1 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Thu, 24 Apr 2025 14:49:28 -0700 Subject: [PATCH 12/28] fix: resolve some of the typo errors (#1655) * fix: resolve the type in geo.py * fix --- bigframes/bigquery/_operations/geo.py | 2 +- bigframes/core/compile/googlesql/expression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 0856baabea..fc9bd1a653 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -264,7 +264,7 @@ def st_distance( A series containing geography objects. other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry): The series or geometric object to calculate the distance in meters - to from the geography objects in ``series``. + to form the geography objects in ``series``. 
use_spheroid (optional, default ``False``): Determines how this function measures distance. If ``use_spheroid`` is False, the function measures distance on the surface of a perfect diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py index 20d6dbf9a1..581ab67718 100644 --- a/bigframes/core/compile/googlesql/expression.py +++ b/bigframes/core/compile/googlesql/expression.py @@ -25,7 +25,7 @@ * `expression`: Models basic SQL expressions. Extended classes (not part of standard GoogleSQL syntax, but added for convenience): -i + * `ColumnExpression`: Represents column references. * `TableExpression`: Represents table references. * `AliasExpression`: Represents aliased expressions. From 6199023a6a71e72e926f5879e74a15215bc6e4a0 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:14:28 -0700 Subject: [PATCH 13/28] feat: add GeminiTextGenerator.predict structured output (#1653) * feat: add GeminiTextGenerator.predict structured output * test * fix tests --- bigframes/ml/base.py | 22 +-- bigframes/ml/core.py | 31 +++- bigframes/ml/globals.py | 2 +- bigframes/ml/imported.py | 35 ++-- bigframes/ml/llm.py | 111 +++++------ bigframes/ml/remote.py | 21 +-- bigframes/ml/sql.py | 40 ++-- bigframes/ml/utils.py | 25 ++- tests/system/small/ml/test_llm.py | 296 ++++++++++++++++++------------ 9 files changed, 339 insertions(+), 244 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 2b25bc82f0..c36457d0b5 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,7 +22,7 @@ """ import abc -from typing import Callable, cast, Mapping, Optional, TypeVar, Union +from typing import cast, Optional, TypeVar, Union import warnings import bigframes_vendored.sklearn.base @@ -244,18 +244,12 @@ def fit( class RetriableRemotePredictor(BaseEstimator): - @property - @abc.abstractmethod - def _predict_func(self) -> Callable[[bpd.DataFrame, Mapping], 
bpd.DataFrame]: - pass - - @property - @abc.abstractmethod - def _status_col(self) -> str: - pass - def _predict_and_retry( - self, X: bpd.DataFrame, options: Mapping, max_retries: int + self, + bqml_model_predict_tvf: core.BqmlModel.TvfDef, + X: bpd.DataFrame, + options: dict, + max_retries: int, ) -> bpd.DataFrame: assert self._bqml_model is not None @@ -269,9 +263,9 @@ def _predict_and_retry( warnings.warn(msg, category=RuntimeWarning) break - df = self._predict_func(df_fail, options) + df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options) - success = df[self._status_col].str.len() == 0 + success = df[bqml_model_predict_tvf.status_col].str.len() == 0 df_succ = df[success] df_fail = df[~success] diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 81637333b0..e11f7d82ba 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -16,6 +16,7 @@ from __future__ import annotations +import dataclasses import datetime from typing import Callable, cast, Iterable, Mapping, Optional, Union import uuid @@ -44,6 +45,11 @@ class BqmlModel(BaseBqml): BigQuery DataFrames ML. 
""" + @dataclasses.dataclass + class TvfDef: + tvf: Callable[[BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + status_col: str + def __init__(self, session: bigframes.Session, model: bigquery.Model): self._session = session self._model = model @@ -159,8 +165,9 @@ def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: def generate_text( self, input_data: bpd.DataFrame, - options: Mapping[str, int | float], + options: dict[str, Union[int, float, bool]], ) -> bpd.DataFrame: + options["flatten_json_output"] = True return self._apply_ml_tvf( input_data, lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text( @@ -169,11 +176,14 @@ def generate_text( ), ) + generate_text_tvf = TvfDef(generate_text, "ml_generate_text_status") + def generate_embedding( self, input_data: bpd.DataFrame, - options: Mapping[str, int | float], + options: dict[str, Union[int, float, bool]], ) -> bpd.DataFrame: + options["flatten_json_output"] = True return self._apply_ml_tvf( input_data, lambda source_sql: self._model_manipulation_sql_generator.ml_generate_embedding( @@ -182,6 +192,23 @@ def generate_embedding( ), ) + generate_embedding_tvf = TvfDef(generate_embedding, "ml_generate_embedding_status") + + def generate_table( + self, + input_data: bpd.DataFrame, + options: dict[str, Union[int, float, bool, Mapping]], + ) -> bpd.DataFrame: + return self._apply_ml_tvf( + input_data, + lambda source_sql: self._model_manipulation_sql_generator.ai_generate_table( + source_sql=source_sql, + struct_options=options, + ), + ) + + generate_table_tvf = TvfDef(generate_table, "status") + def detect_anomalies( self, input_data: bpd.DataFrame, options: Mapping[str, int | float] ) -> bpd.DataFrame: diff --git a/bigframes/ml/globals.py b/bigframes/ml/globals.py index 44e9463727..62cfdbef72 100644 --- a/bigframes/ml/globals.py +++ b/bigframes/ml/globals.py @@ -19,7 +19,7 @@ _BASE_SQL_GENERATOR = sql.BaseSqlGenerator() _BQML_MODEL_FACTORY = core.BqmlModelFactory() -_SUPPORTED_DTYPES = 
( +_REMOTE_MODEL_SUPPORTED_DTYPES = ( "bool", "string", "int64", diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 93152a6b99..a73ee352d0 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -216,8 +216,8 @@ def __init__( self, model_path: str, *, - input: Mapping[str, str] = {}, - output: Mapping[str, str] = {}, + input: Optional[Mapping[str, str]] = None, + output: Optional[Mapping[str, str]] = None, session: Optional[bigframes.session.Session] = None, ): self.session = session or bpd.get_global_session() @@ -234,20 +234,23 @@ def _create_bqml_model(self): return self._bqml_model_factory.create_imported_model( session=self.session, options=options ) - else: - for io in (self.input, self.output): - for v in io.values(): - if v not in globals._SUPPORTED_DTYPES: - raise ValueError( - f"field_type {v} is not supported. We only support {', '.join(globals._SUPPORTED_DTYPES)}." - ) - - return self._bqml_model_factory.create_xgboost_imported_model( - session=self.session, - input=self.input, - output=self.output, - options=options, - ) + if not self.input or not self.output: + raise ValueError("input and output must both or neigher be set.") + self.input = { + k: utils.standardize_type(v, globals._REMOTE_MODEL_SUPPORTED_DTYPES) + for k, v in self.input.items() + } + self.output = { + k: utils.standardize_type(v, globals._REMOTE_MODEL_SUPPORTED_DTYPES) + for k, v in self.output.items() + } + + return self._bqml_model_factory.create_xgboost_imported_model( + session=self.session, + input=self.input, + output=self.output, + options=options, + ) @classmethod def _from_bq( diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index cce05ea1f2..49dbfd3443 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union +from typing import cast, Iterable, Literal, Mapping, Optional, Union import warnings import 
bigframes_vendored.constants as constants @@ -92,10 +92,6 @@ _CLAUDE_3_OPUS_ENDPOINT, ) - -_ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" -_ML_GENERATE_EMBEDDING_STATUS = "ml_generate_embedding_status" - _MODEL_NOT_SUPPORTED_WARNING = ( "Model name '{model_name}' is not supported. " "We are currently aware of the following models: {known_models}. " @@ -193,18 +189,6 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - @property - def _predict_func( - self, - ) -> Callable[ - [bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame - ]: - return self._bqml_model.generate_embedding - - @property - def _status_col(self) -> str: - return _ML_GENERATE_EMBEDDING_STATUS - def predict( self, X: utils.ArrayType, *, max_retries: int = 0 ) -> bigframes.dataframe.DataFrame: @@ -233,11 +217,14 @@ def predict( col_label = cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "content"}) - options = { - "flatten_json_output": True, - } + options: dict = {} - return self._predict_and_retry(X, options=options, max_retries=max_retries) + return self._predict_and_retry( + core.BqmlModel.generate_embedding_tvf, + X, + options=options, + max_retries=max_retries, + ) def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerator: """Save the model to BigQuery. 
@@ -339,18 +326,6 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - @property - def _predict_func( - self, - ) -> Callable[ - [bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame - ]: - return self._bqml_model.generate_embedding - - @property - def _status_col(self) -> str: - return _ML_GENERATE_EMBEDDING_STATUS - def predict( self, X: utils.ArrayType, *, max_retries: int = 0 ) -> bigframes.dataframe.DataFrame: @@ -384,11 +359,14 @@ def predict( if X["content"].dtype == dtypes.OBJ_REF_DTYPE: X["content"] = X["content"].blob._get_runtime("R", with_metadata=True) - options = { - "flatten_json_output": True, - } + options: dict = {} - return self._predict_and_retry(X, options=options, max_retries=max_retries) + return self._predict_and_retry( + core.BqmlModel.generate_embedding_tvf, + X, + options=options, + max_retries=max_retries, + ) def to_gbq( self, model_name: str, replace: bool = False @@ -533,18 +511,6 @@ def _bqml_options(self) -> dict: } return options - @property - def _predict_func( - self, - ) -> Callable[ - [bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame - ]: - return self._bqml_model.generate_text - - @property - def _status_col(self) -> str: - return _ML_GENERATE_TEXT_STATUS - def fit( self, X: utils.ArrayType, @@ -596,6 +562,7 @@ def predict( ground_with_google_search: bool = False, max_retries: int = 0, prompt: Optional[Iterable[Union[str, bigframes.series.Series]]] = None, + output_schema: Optional[Mapping[str, str]] = None, ) -> bigframes.dataframe.DataFrame: """Predict the result from input DataFrame. @@ -645,6 +612,9 @@ def predict( Construct a prompt struct column for prediction based on the input. The input must be an Iterable that can take string literals, such as "summarize", string column(s) of X, such as X["str_col"], or blob column(s) of X, such as X["blob_col"]. 
It creates a struct column of the items of the iterable, and use the concatenated result as the input prompt. No-op if set to None. + output_schema (Mapping[str, str] or None, default None): + The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of <column_name>:<type>. + Supported types are int64, float64, bool and string. If None, output text result. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ @@ -707,16 +677,31 @@ def predict( col_label = cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "prompt"}) - options = { + options: dict = { "temperature": temperature, "max_output_tokens": max_output_tokens, - "top_k": top_k, + # "top_k": top_k, # TODO(garrettwu): the option is deprecated in Gemini 1.5 forward. "top_p": top_p, - "flatten_json_output": True, "ground_with_google_search": ground_with_google_search, } + if output_schema: + output_schema = { + k: utils.standardize_type(v) for k, v in output_schema.items() + } + options["output_schema"] = output_schema + return self._predict_and_retry( + core.BqmlModel.generate_table_tvf, + X, + options=options, + max_retries=max_retries, + ) - return self._predict_and_retry(X, options=options, max_retries=max_retries) + return self._predict_and_retry( + core.BqmlModel.generate_text_tvf, + X, + options=options, + max_retries=max_retries, + ) def score( self, @@ -916,18 +901,6 @@ def _bqml_options(self) -> dict: } return options - @property - def _predict_func( - self, - ) -> Callable[ - [bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame - ]: - return self._bqml_model.generate_text - - @property - def _status_col(self) -> str: - return _ML_GENERATE_TEXT_STATUS - def predict( self, X: utils.ArrayType, @@ -1000,10 +973,14 @@ def predict( "max_output_tokens": max_output_tokens, "top_k": top_k, "top_p": top_p, - "flatten_json_output": True, } - return 
self._predict_and_retry(X, options=options, max_retries=max_retries) + return self._predict_and_retry( + core.BqmlModel.generate_text_tvf, + X, + options=options, + max_retries=max_retries, + ) def to_gbq(self, model_name: str, replace: bool = False) -> Claude3TextGenerator: """Save the model to BigQuery. diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index cc711cbe3b..b091c61f3f 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -78,19 +78,14 @@ def _create_bqml_model(self): "endpoint": self.endpoint, } - def standardize_type(v: str): - v = v.lower() - v = v.replace("boolean", "bool") - - if v not in globals._SUPPORTED_DTYPES: - raise ValueError( - f"Data type {v} is not supported. We only support {', '.join(globals._SUPPORTED_DTYPES)}." - ) - - return v - - self.input = {k: standardize_type(v) for k, v in self.input.items()} - self.output = {k: standardize_type(v) for k, v in self.output.items()} + self.input = { + k: utils.standardize_type(v, globals._REMOTE_MODEL_SUPPORTED_DTYPES) + for k, v in self.input.items() + } + self.output = { + k: utils.standardize_type(v, globals._REMOTE_MODEL_SUPPORTED_DTYPES) + for k, v in self.output.items() + } return self._bqml_model_factory.create_remote_model( session=self.session, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index a756fac3b9..84ea37c5fc 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -24,6 +24,8 @@ import bigframes.core.compile.googlesql as sql_utils import bigframes.core.sql as sql_vals +INDENT_STR = " " + # TODO: Add proper escaping logic from core/compile module class BaseSqlGenerator: @@ -44,35 +46,35 @@ def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" - indent_str = " " param_strs = [f"{k}={self.encode_value(v)}" for k, v in kwargs.items()] - return "\n" + 
indent_str + f",\n{indent_str}".join(param_strs) + return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(param_strs) - def build_structs(self, **kwargs: Union[int, float]) -> str: + def build_structs(self, **kwargs: Union[int, float, str, Mapping]) -> str: """Encode a dict of values into a formatted STRUCT items for SQL""" - indent_str = " " - param_strs = [ - f"{sql_vals.simple_literal(v)} AS {sql_utils.identifier(k)}" - for k, v in kwargs.items() - ] - return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + param_strs = [] + for k, v in kwargs.items(): + v_trans = self.build_schema(**v) if isinstance(v, Mapping) else v + + param_strs.append( + f"{sql_vals.simple_literal(v_trans)} AS {sql_utils.identifier(k)}" + ) + + return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(param_strs) def build_expressions(self, *expr_sqls: str) -> str: """Encode a Iterable of SQL expressions into a formatted Iterable for SQL""" - indent_str = " " - return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) + return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(expr_sqls) def build_schema(self, **kwargs: str) -> str: """Encode a dict of values into a formatted schema type items for SQL""" - indent_str = " " param_strs = [f"{sql_utils.identifier(k)} {v}" for k, v in kwargs.items()] - return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(param_strs) def options(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode the OPTIONS clause for BQML""" return f"OPTIONS({self.build_parameters(**kwargs)})" - def struct_options(self, **kwargs: Union[int, float]) -> str: + def struct_options(self, **kwargs: Union[int, float, Mapping]) -> str: """Encode a BQ STRUCT as options.""" return f"STRUCT({self.build_structs(**kwargs)})" @@ -406,3 +408,13 @@ def ml_transform(self, source_sql: str) -> str: """Encode ML.TRANSFORM for BQML""" return f"""SELECT * FROM ML.TRANSFORM(MODEL {self._model_ref_sql()}, 
({source_sql}))""" + + def ai_generate_table( + self, + source_sql: str, + struct_options: Mapping[str, Union[int, float, bool, Mapping]], + ) -> str: + """Encode AI.GENERATE_TABLE for BQML""" + struct_options_sql = self.struct_options(**struct_options) + return f"""SELECT * FROM AI.GENERATE_TABLE(MODEL {self._model_ref_sql()}, + ({source_sql}), {struct_options_sql})""" diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index e034fd00f7..5c02789576 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,7 +13,17 @@ # limitations under the License. import typing -from typing import Any, Generator, Hashable, Literal, Mapping, Optional, Tuple, Union +from typing import ( + Any, + Generator, + Hashable, + Iterable, + Literal, + Mapping, + Optional, + Tuple, + Union, +) import bigframes_vendored.constants as constants from google.cloud import bigquery @@ -178,3 +188,16 @@ def combine_training_and_evaluation_data( bqml_options["data_split_col"] = split_col return X, y, bqml_options + + +def standardize_type(v: str, supported_dtypes: Optional[Iterable[str]] = None): + t = v.lower() + t = t.replace("boolean", "bool") + + if supported_dtypes: + if t not in supported_dtypes: + raise ValueError( + f"Data type {v} is not supported. We only support {', '.join(supported_dtypes)}." + ) + + return t diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 544889bf5a..90483667d8 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Callable from unittest import mock import pandas as pd @@ -222,6 +223,47 @@ def test_gemini_text_generator_multi_cols_predict_success( ) +@pytest.mark.parametrize( + "model_name", + ( + "gemini-1.5-pro-preview-0514", + "gemini-1.5-flash-preview-0514", + "gemini-1.5-pro-001", + "gemini-1.5-pro-002", + "gemini-1.5-flash-001", + "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", + ), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_generator_predict_output_schema_success( + llm_text_df: bpd.DataFrame, model_name, session, bq_connection +): + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + output_schema = { + "bool_output": "bool", + "int_output": "int64", + "float_output": "float64", + "str_output": "string", + } + df = gemini_text_generator_model.predict( + llm_text_df, output_schema=output_schema + ).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=list(output_schema.keys()) + ["prompt", "full_response", "status"], + index=3, + col_exact=False, + ) + + assert df["bool_output"].dtype == pd.BooleanDtype() + assert df["int_output"].dtype == pd.Int64Dtype() + assert df["float_output"].dtype == pd.Float64Dtype() + assert df["str_output"].dtype == pd.StringDtype(storage="pyarrow") + + # Overrides __eq__ function for comparing as mock.call parameter class EqCmpAllDataFrame(bpd.DataFrame): def __eq__(self, other): @@ -239,9 +281,7 @@ def __eq__(self, other): { "temperature": 0.9, "max_output_tokens": 8192, - "top_k": 40, "top_p": 1.0, - "flatten_json_output": True, "ground_with_google_search": False, }, ), @@ -251,7 +291,6 @@ def __eq__(self, other): "max_output_tokens": 128, "top_k": 40, "top_p": 0.95, - "flatten_json_output": True, }, ), ], @@ -297,11 +336,16 @@ def test_text_generator_retry_success( session=session, ) + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) 
mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) # Responses. Retry twice then all succeeded. - mock_bqml_model.generate_text.side_effect = [ + mock_generate_text.side_effect = [ EqCmpAllDataFrame( { "ml_generate_text_status": ["", "error", "error"], @@ -344,32 +388,33 @@ def test_text_generator_retry_success( ) text_generator_model._bqml_model = mock_bqml_model - # 3rd retry isn't triggered - result = text_generator_model.predict(df0, max_retries=3) + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # 3rd retry isn't triggered + result = text_generator_model.predict(df0, max_retries=3) - mock_bqml_model.generate_text.assert_has_calls( - [ - mock.call(df0, options), - mock.call(df1, options), - mock.call(df2, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "", ""], - "prompt": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + mock.call(mock_bqml_model, df2, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "", ""], + "prompt": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) @pytest.mark.parametrize( @@ -383,9 +428,7 @@ def test_text_generator_retry_success( { "temperature": 0.9, "max_output_tokens": 8192, - "top_k": 40, "top_p": 1.0, - "flatten_json_output": True, "ground_with_google_search": False, }, ), @@ -395,7 +438,6 @@ def 
test_text_generator_retry_success( "max_output_tokens": 128, "top_k": 40, "top_p": 0.95, - "flatten_json_output": True, }, ), ], @@ -431,10 +473,16 @@ def test_text_generator_retry_no_progress( session=session, ) + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) # Responses. Retry once, no progress, just stop. - mock_bqml_model.generate_text.side_effect = [ + mock_generate_text.side_effect = [ EqCmpAllDataFrame( { "ml_generate_text_status": ["", "error", "error"], @@ -467,31 +515,32 @@ def test_text_generator_retry_no_progress( ) text_generator_model._bqml_model = mock_bqml_model - # No progress, only conduct retry once - result = text_generator_model.predict(df0, max_retries=3) + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # No progress, only conduct retry once + result = text_generator_model.predict(df0, max_retries=3) - mock_bqml_model.generate_text.assert_has_calls( - [ - mock.call(df0, options), - mock.call(df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + 
check_index_type=False, + ) def test_text_embedding_generator_retry_success(session, bq_connection): @@ -529,11 +578,17 @@ def test_text_embedding_generator_retry_success(session, bq_connection): session=session, ) + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) # Responses. Retry twice then all succeeded. - mock_bqml_model.generate_embedding.side_effect = [ + mock_generate_embedding.side_effect = [ EqCmpAllDataFrame( { "ml_generate_embedding_status": ["", "error", "error"], @@ -568,41 +623,42 @@ def test_text_embedding_generator_retry_success(session, bq_connection): session=session, ), ] - options = { - "flatten_json_output": True, - } + options: dict = {} text_embedding_model = llm.TextEmbeddingGenerator( connection_name=bq_connection, session=session ) text_embedding_model._bqml_model = mock_bqml_model - # 3rd retry isn't triggered - result = text_embedding_model.predict(df0, max_retries=3) - - mock_bqml_model.generate_embedding.assert_has_calls( - [ - mock.call(df0, options), - mock.call(df1, options), - mock.call(df2, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_embedding_status": ["", "", ""], - "content": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # 3rd retry isn't triggered + result = text_embedding_model.predict(df0, max_retries=3) + + mock_generate_embedding.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + 
mock.call(mock_bqml_model, df2, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "", ""], + "content": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) def test_text_embedding_generator_retry_no_progress(session, bq_connection): @@ -630,10 +686,17 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): session=session, ) + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) + # Responses. Retry once, no progress, just stop. - mock_bqml_model.generate_embedding.side_effect = [ + mock_generate_embedding.side_effect = [ EqCmpAllDataFrame( { "ml_generate_embedding_status": ["", "error", "error"], @@ -658,40 +721,41 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): session=session, ), ] - options = { - "flatten_json_output": True, - } + options: dict = {} text_embedding_model = llm.TextEmbeddingGenerator( connection_name=bq_connection, session=session ) text_embedding_model._bqml_model = mock_bqml_model - # No progress, only conduct retry once - result = text_embedding_model.predict(df0, max_retries=3) + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # No progress, only conduct retry once + result = text_embedding_model.predict(df0, max_retries=3) - mock_bqml_model.generate_embedding.assert_has_calls( - [ - mock.call(df0, options), - mock.call(df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - 
"ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) + mock_generate_embedding.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + check_index_type=False, + ) @pytest.mark.flaky(retries=2) From 53caa8d689e64436f5313095ee27479a06d8e8a8 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 24 Apr 2025 16:21:12 -0700 Subject: [PATCH 14/28] docs: add sample code snippets for `udf` (#1649) * docs: add sample code snippets for `udf` * remove connection cleanup, not neede for udf * use bigframes project for doctest * restore python version agnostic logic for udf --- bigframes/functions/_function_session.py | 15 ++- samples/snippets/conftest.py | 11 +++ samples/snippets/remote_function.py | 4 +- samples/snippets/udf.py | 121 +++++++++++++++++++++++ samples/snippets/udf_test.py | 38 +++++++ 5 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 samples/snippets/udf.py create mode 100644 samples/snippets/udf_test.py diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 7fb5cc114b..e18f7084db 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -838,9 +838,18 @@ def wrapper(func): TypeError, f"func must be a callable, got {func}" ) - # Managed function supports version >= 3.11. 
- signature_kwargs: Mapping[str, Any] = {"eval_str": True} - signature = inspect.signature(func, **signature_kwargs) + if sys.version_info >= (3, 10): + # Add `eval_str = True` so that deferred annotations are turned into their + # corresponding type objects. Need Python 3.10 for eval_str parameter. + # https://docs.python.org/3/library/inspect.html#inspect.signature + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + else: + signature_kwargs = {} # type: ignore + + signature = inspect.signature( + func, + **signature_kwargs, + ) # Try to get input types via type annotations. if input_types is None: diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 5cba045ce4..e8253bc5a7 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -24,6 +24,8 @@ "python-bigquery-dataframes", "samples/snippets" ) +routine_prefixer = test_utils.prefixer.Prefixer("bigframes", "") + @pytest.fixture(scope="session", autouse=True) def cleanup_datasets(bigquery_client: bigquery.Client) -> None: @@ -106,3 +108,12 @@ def random_model_id_eu( full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}" yield full_model_id bigquery_client.delete_model(full_model_id, not_found_ok=True) + + +@pytest.fixture +def routine_id() -> Iterator[str]: + """Create a new BQ routine ID each time, so random_routine_id can be used as + target for udf creation. 
+ """ + random_routine_id = routine_prefixer.create_prefix() + yield random_routine_id diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 3a7031ef89..4c5b365007 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -21,7 +21,7 @@ def run_remote_function_and_read_gbq_function(project_id: str) -> None: # Set BigQuery DataFrames options bpd.options.bigquery.project = your_gcp_project_id - bpd.options.bigquery.location = "us" + bpd.options.bigquery.location = "US" # BigQuery DataFrames gives you the ability to turn your custom scalar # functions into a BigQuery remote function. It requires the GCP project to @@ -56,7 +56,7 @@ def get_bucket(num: float) -> str: boundary = 4000 return "at_or_above_4000" if num >= boundary else "below_4000" - # Then we can apply the remote function on the `Series`` of interest via + # Then we can apply the remote function on the `Series` of interest via # `apply` API and store the result in a new column in the DataFrame. df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket)) diff --git a/samples/snippets/udf.py b/samples/snippets/udf.py new file mode 100644 index 0000000000..495cd33e84 --- /dev/null +++ b/samples/snippets/udf.py @@ -0,0 +1,121 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def run_udf_and_read_gbq_function( + project_id: str, dataset_id: str, routine_id: str +) -> None: + your_gcp_project_id = project_id + your_bq_dataset_id = dataset_id + your_bq_routine_id = routine_id + + # [START bigquery_dataframes_udf] + import bigframes.pandas as bpd + + # Set BigQuery DataFrames options + bpd.options.bigquery.project = your_gcp_project_id + bpd.options.bigquery.location = "US" + + # BigQuery DataFrames gives you the ability to turn your custom functions + # into a BigQuery Python UDF. One can find more details about the usage and + # the requirements via `help` command. + help(bpd.udf) + + # Read a table and inspect the column of interest. + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + df["body_mass_g"].peek(10) + + # Define a custom function, and specify the intent to turn it into a + # BigQuery Python UDF. Let's try a `pandas`-like use case in which we want + # to apply a user defined function to every value in a `Series`, more + # specifically bucketize the `body_mass_g` value of the penguins, which is a + # real number, into a category, which is a string. + @bpd.udf( + dataset=your_bq_dataset_id, + name=your_bq_routine_id, + ) + def get_bucket(num: float) -> str: + if not num: + return "NA" + boundary = 4000 + return "at_or_above_4000" if num >= boundary else "below_4000" + + # Then we can apply the udf on the `Series` of interest via + # `apply` API and store the result in a new column in the DataFrame. + df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket)) + + # This will add a new column `body_mass_bucket` in the DataFrame. You can + # preview the original value and the bucketized value side by side. + df[["body_mass_g", "body_mass_bucket"]].peek(10) + + # The above operation was possible by doing all the computation on the + # cloud through an underlying BigQuery Python UDF that was created to + # support the user's operations in the Python code. 
+ + # The BigQuery Python UDF created to support the BigQuery DataFrames + # udf can be located via a property `bigframes_bigquery_function` + # set in the udf object. + print(f"Created BQ Python UDF: {get_bucket.bigframes_bigquery_function}") + + # If you have already defined a custom function in BigQuery, either via the + # BigQuery Google Cloud Console or with the `udf` decorator, + # or otherwise, you may use it with BigQuery DataFrames with the + # `read_gbq_function` method. More details are available via the `help` + # command. + help(bpd.read_gbq_function) + + existing_get_bucket_bq_udf = get_bucket.bigframes_bigquery_function + + # Here is an example of using `read_gbq_function` to load an existing + # BigQuery Python UDF. + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + get_bucket_function = bpd.read_gbq_function(existing_get_bucket_bq_udf) + + df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket_function)) + df.peek(10) + + # Let's continue trying other potential use cases of udf. Let's say we + # consider the `species`, `island` and `sex` of the penguins sensitive + # information and want to redact that by replacing with their hash code + # instead. Let's define another scalar custom function and decorate it + # as a udf. The custom function in this example has external package + # dependency, which can be specified via `packages` parameter. 
+ @bpd.udf( + dataset=your_bq_dataset_id, + name=your_bq_routine_id, + packages=["cryptography"], + ) + def get_hash(input: str) -> str: + from cryptography.fernet import Fernet + + # handle missing value + if input is None: + input = "" + + key = Fernet.generate_key() + f = Fernet(key) + return f.encrypt(input.encode()).decode() + + # We can use this udf in another `pandas`-like API `map` that + # can be applied on a DataFrame + df_redacted = df[["species", "island", "sex"]].map(get_hash) + df_redacted.peek(10) + + # [END bigquery_dataframes_udf] + + # Clean up cloud artifacts + session = bpd.get_global_session() + session.bqclient.delete_routine( + f"{your_bq_dataset_id}.{your_bq_routine_id}", not_found_ok=True + ) diff --git a/samples/snippets/udf_test.py b/samples/snippets/udf_test.py new file mode 100644 index 0000000000..a352b4c8ce --- /dev/null +++ b/samples/snippets/udf_test.py @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.pandas + +from . import udf + + +def test_udf_and_read_gbq_function( + capsys: pytest.CaptureFixture[str], + dataset_id: str, + routine_id: str, +) -> None: + # We need a fresh session since we're modifying connection options. 
+ bigframes.pandas.close_session() + + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + your_project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + + udf.run_udf_and_read_gbq_function(your_project_id, dataset_id, routine_id) + out, _ = capsys.readouterr() + assert "Created BQ Python UDF:" in out From 0895ef83ddffb62668ecda7bd08eacb98efea98b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 25 Apr 2025 13:20:44 -0700 Subject: [PATCH 15/28] refactor: add empty SQLGlotCompiler classes (#1652) * define configs module for CompileRequest and CompileResult * workaround AttributeError debugging issue * refactor: add empty SQLGlotCompiler --- bigframes/core/compile/api.py | 6 +- bigframes/core/compile/compiler.py | 22 +-- bigframes/core/compile/configs.py | 36 +++++ bigframes/core/compile/sqlglot/__init__.py | 5 + bigframes/core/compile/sqlglot/compiler.py | 141 ++++++++++++++++++ bigframes/core/compile/sqlglot/sql_gen.py | 38 +++++ bigframes/dataframe.py | 22 ++- .../core/compile/sqlglot/compiler_session.py | 76 ++++++++++ tests/unit/core/compile/sqlglot/conftest.py | 29 ++++ .../compile/sqlglot/test_compile_readlocal.py | 24 +++ 10 files changed, 373 insertions(+), 26 deletions(-) create mode 100644 bigframes/core/compile/configs.py create mode 100644 bigframes/core/compile/sqlglot/compiler.py create mode 100644 bigframes/core/compile/sqlglot/sql_gen.py create mode 100644 tests/unit/core/compile/sqlglot/compiler_session.py create mode 100644 tests/unit/core/compile/sqlglot/conftest.py create mode 100644 tests/unit/core/compile/sqlglot/test_compile_readlocal.py diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index bb19f92be9..287549b571 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -18,7 +18,7 @@ import google.cloud.bigquery as bigquery from bigframes.core import rewrite -from bigframes.core.compile import 
compiler +from bigframes.core.compile import compiler, configs if TYPE_CHECKING: import bigframes.core.nodes @@ -34,7 +34,7 @@ def compile( limit: Optional[int] = None, ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" - request = compiler.CompileRequest(node, sort_rows=ordered, peek_count=limit) + request = configs.CompileRequest(node, sort_rows=ordered, peek_count=limit) return compiler.compile_sql(request).sql def compile_raw( @@ -44,7 +44,7 @@ def compile_raw( str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering ]: """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - request = compiler.CompileRequest( + request = configs.CompileRequest( node, sort_rows=False, materialize_all_order_keys=True ) result = compiler.compile_sql(request) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 8ca6cb35a3..a778889925 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -29,6 +29,7 @@ from bigframes.core import expression import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl +import bigframes.core.compile.configs as configs import bigframes.core.compile.explode import bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.nodes as nodes @@ -39,22 +40,7 @@ import bigframes.core -@dataclasses.dataclass(frozen=True) -class CompileRequest: - node: nodes.BigFrameNode - sort_rows: bool - materialize_all_order_keys: bool = False - peek_count: typing.Optional[int] = None - - -@dataclasses.dataclass(frozen=True) -class CompileResult: - sql: str - sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField] - row_order: Optional[bf_ordering.RowOrdering] - - -def compile_sql(request: CompileRequest) -> CompileResult: +def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: output_names = tuple((expression.DerefOp(id), id.sql) 
for id in request.node.ids) result_node = nodes.ResultNode( request.node, @@ -74,7 +60,7 @@ def compile_sql(request: CompileRequest) -> CompileResult: if request.sort_rows: result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node)) sql = compile_result_node(result_node) - return CompileResult( + return configs.CompileResult( sql, result_node.schema.to_bigquery(), result_node.order_by ) @@ -88,7 +74,7 @@ def compile_sql(request: CompileRequest) -> CompileResult: ordering if ordering.referenced_columns.issubset(result_node.ids) else None ) assert (not request.materialize_all_order_keys) or (output_order is not None) - return CompileResult(sql, result_node.schema.to_bigquery(), output_order) + return configs.CompileResult(sql, result_node.schema.to_bigquery(), output_order) def _replace_unsupported_ops(node: nodes.BigFrameNode): diff --git a/bigframes/core/compile/configs.py b/bigframes/core/compile/configs.py new file mode 100644 index 0000000000..5ffca0cf43 --- /dev/null +++ b/bigframes/core/compile/configs.py @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import dataclasses +import typing + +import google.cloud.bigquery + +from bigframes.core import nodes, ordering + + +@dataclasses.dataclass(frozen=True) +class CompileRequest: + node: nodes.BigFrameNode + sort_rows: bool + materialize_all_order_keys: bool = False + peek_count: typing.Optional[int] = None + + +@dataclasses.dataclass(frozen=True) +class CompileResult: + sql: str + sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField] + row_order: typing.Optional[ordering.RowOrdering] diff --git a/bigframes/core/compile/sqlglot/__init__.py b/bigframes/core/compile/sqlglot/__init__.py index 0a2669d7a2..2f40894975 100644 --- a/bigframes/core/compile/sqlglot/__init__.py +++ b/bigframes/core/compile/sqlglot/__init__.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + +from bigframes.core.compile.sqlglot.compiler import SQLGlotCompiler + +__all__ = ["SQLGlotCompiler"] diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py new file mode 100644 index 0000000000..df800c5489 --- /dev/null +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -0,0 +1,141 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import dataclasses +import functools +import typing + +import google.cloud.bigquery as bigquery +import sqlglot.expressions as sge + +from bigframes.core import expression, nodes, rewrite +from bigframes.core.compile import configs +from bigframes.core.compile.sqlglot import sql_gen +import bigframes.core.ordering as bf_ordering + + +@dataclasses.dataclass(frozen=True) +class SQLGlotCompiler: + """Compiles BigFrame nodes into SQL using SQLGlot.""" + + sql_gen = sql_gen.SQLGen() + + def compile( + self, + node: nodes.BigFrameNode, + *, + ordered: bool = True, + limit: typing.Optional[int] = None, + ) -> str: + """Compile node into sql where rows are sorted with ORDER BY.""" + request = configs.CompileRequest(node, sort_rows=ordered, peek_count=limit) + return self._compile_sql(request).sql + + def compile_raw( + self, + node: nodes.BigFrameNode, + ) -> typing.Tuple[ + str, typing.Sequence[bigquery.SchemaField], bf_ordering.RowOrdering + ]: + """Compile node into sql that exposes all columns, including hidden + ordering-only columns.""" + request = configs.CompileRequest( + node, sort_rows=False, materialize_all_order_keys=True + ) + result = self._compile_sql(request) + assert result.row_order is not None + return result.sql, result.sql_schema, result.row_order + + def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult: + output_names = tuple( + (expression.DerefOp(id), id.sql) for id in request.node.ids + ) + result_node = nodes.ResultNode( + request.node, + output_cols=output_names, + limit=request.peek_count, + ) + if request.sort_rows: + # Can only pullup slice if we are doing ORDER BY in outermost SELECT + # Need to do this before replacing unsupported ops, as that will rewrite slice ops + result_node = rewrite.pull_up_limits(result_node) + result_node = _replace_unsupported_ops(result_node) + # prune before pulling up order to avoid unnnecessary row_number() ops + result_node = 
typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) + result_node = rewrite.defer_order( + result_node, output_hidden_row_keys=request.materialize_all_order_keys + ) + if request.sort_rows: + result_node = typing.cast( + nodes.ResultNode, rewrite.column_pruning(result_node) + ) + sql = self._compile_result_node(result_node) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), result_node.order_by + ) + + ordering: typing.Optional[bf_ordering.RowOrdering] = result_node.order_by + result_node = dataclasses.replace(result_node, order_by=None) + result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) + sql = self._compile_result_node(result_node) + # Return the ordering iff no extra columns are needed to define the row order + if ordering is not None: + output_order = ( + ordering + if ordering.referenced_columns.issubset(result_node.ids) + else None + ) + assert (not request.materialize_all_order_keys) or (output_order is not None) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), output_order + ) + + def _compile_result_node(self, root: nodes.ResultNode) -> str: + sqlglot_expr = compile_node(root.child) + # TODO: add order_by, limit, and selections to sqlglot_expr + return self.sql_gen.sql(sqlglot_expr) + + +def _replace_unsupported_ops(node: nodes.BigFrameNode): + node = nodes.bottom_up(node, rewrite.rewrite_slice) + node = nodes.bottom_up(node, rewrite.rewrite_timedelta_expressions) + node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) + return node + + +@functools.lru_cache(maxsize=5000) +def compile_node(node: nodes.BigFrameNode) -> sge.Expression: + """Compile node into CompileArrayValue. 
Caches result.""" + return node.reduce_up(lambda node, children: _compile_node(node, *children)) + + +@functools.singledispatch +def _compile_node( + node: nodes.BigFrameNode, *compiled_children: sge.Expression +) -> sge.Expression: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + +@_compile_node.register +def compile_readlocal(node: nodes.ReadLocalNode, *args) -> sge.Expression: + # TODO: add support for reading from local files + return sge.select() + + +@_compile_node.register +def compile_selection(node: nodes.SelectionNode, child: sge.Expression): + # TODO: add support for selection + return child diff --git a/bigframes/core/compile/sqlglot/sql_gen.py b/bigframes/core/compile/sqlglot/sql_gen.py new file mode 100644 index 0000000000..85784e3f5d --- /dev/null +++ b/bigframes/core/compile/sqlglot/sql_gen.py @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dataclasses + +import sqlglot.dialects.bigquery +import sqlglot.expressions as sge + + +@dataclasses.dataclass(frozen=True) +class SQLGen: + """Helper class to build SQLGlot Query and generate SQL string.""" + + dialect = sqlglot.dialects.bigquery.BigQuery + """The SQL dialect used for generation.""" + + quoted: bool = True + """Whether to quote identifiers in the generated SQL.""" + + pretty: bool = True + """Whether to pretty-print the generated SQL.""" + + def sql(self, expr: sge.Expression) -> str: + """Generate SQL string from the given expression.""" + return expr.sql(dialect=self.dialect, pretty=self.pretty) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 95ea487786..7ce59760b3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -419,11 +419,23 @@ def sql(self) -> str: str: string representing the compiled SQL. """ - include_index = self._has_index and ( - self.index.name is not None or len(self.index.names) > 1 - ) - sql, _, _ = self._to_sql_query(include_index=include_index) - return sql + try: + include_index = self._has_index and ( + self.index.name is not None or len(self.index.names) > 1 + ) + sql, _, _ = self._to_sql_query(include_index=include_index) + return sql + except AttributeError as e: + # Workaround for a development-mode debugging issue: + # An `AttributeError` originating *inside* this @property getter (e.g., due to + # a typo or referencing a non-existent attribute) can be mistakenly intercepted + # by the class's __getattr__ method if one is defined. + # We catch the AttributeError and raise SyntaxError instead to make it clear + # the error originates *here* in the property implementation. + # See: https://stackoverflow.com/questions/50542177/correct-handling-of-attributeerror-in-getattr-when-using-property + raise SyntaxError( + "AttributeError encountered. Please check the implementation for incorrect attribute access." 
+ ) from e @property def query_job(self) -> Optional[bigquery.QueryJob]: diff --git a/tests/unit/core/compile/sqlglot/compiler_session.py b/tests/unit/core/compile/sqlglot/compiler_session.py new file mode 100644 index 0000000000..eddae8f891 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/compiler_session.py @@ -0,0 +1,76 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import typing +import weakref + +import bigframes.core +import bigframes.core.compile.sqlglot as sqlglot +import bigframes.dataframe +import bigframes.session.executor +import bigframes.session.metrics + + +@dataclasses.dataclass +class SQLCompilerExecutor(bigframes.session.executor.Executor): + """Executor for SQL compilation using sqlglot.""" + + compiler = sqlglot.SQLGlotCompiler() + + def to_sql( + self, + array_value: bigframes.core.ArrayValue, + offset_column: typing.Optional[str] = None, + ordered: bool = True, + enable_cache: bool = False, + ) -> str: + if offset_column: + array_value, _ = array_value.promote_offsets() + + # Compared with BigQueryCachingExecutor, SQLCompilerExecutor skips + # caching the subtree. + return self.compiler.compile(array_value.node, ordered=ordered) + + +class SQLCompilerSession(bigframes.session.Session): + """Session for SQL compilation using sqlglot.""" + + def __init__(self): + # TODO: remove unused attributes. 
+ self._location = None # type: ignore + self._bq_kms_key_name = None # type: ignore + self._clients_provider = None # type: ignore + self.ibis_client = None # type: ignore + self._bq_connection = None # type: ignore + self._skip_bq_connection_check = True + self._objects: list[ + weakref.ReferenceType[ + typing.Union[ + bigframes.core.indexes.Index, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ] + ] + ] = [] + self._strictly_ordered: bool = True + self._allow_ambiguity = False # type: ignore + self._default_index_type = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + self._metrics = bigframes.session.metrics.ExecutionMetrics() + self._remote_function_session = None # type: ignore + self._temp_storage_manager = None # type: ignore + self._loader = None # type: ignore + + self._session_id: str = "sqlglot_unit_tests_session" + self._executor = SQLCompilerExecutor() diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py new file mode 100644 index 0000000000..c9087abe88 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + + +@pytest.fixture(scope="module") +def compiler_session(): + from . 
import compiler_session + + return compiler_session.SQLCompilerSession() + + +@pytest.fixture(scope="module") +def all_types_df() -> pd.DataFrame: + # TODO: all types pandas dataframes + return pd.DataFrame({}) diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py new file mode 100644 index 0000000000..2a69bb69b4 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd + +import bigframes +import bigframes.pandas as bpd + + +def test_compile_local(all_types_df: pd.DataFrame, compiler_session: bigframes.Session): + bf_df = bpd.DataFrame(all_types_df, session=compiler_session) + sql = bf_df.sql + assert sql == "SELECT" From c46ad0647785a9207359eba0fb5b6f7a16610f2a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 25 Apr 2025 13:26:25 -0700 Subject: [PATCH 16/28] feat: Support write api as loading option (#1617) --- bigframes/core/local_data.py | 78 +++++++++++++++++---- bigframes/core/utils.py | 3 + bigframes/session/__init__.py | 9 ++- bigframes/session/clients.py | 31 ++++++++ bigframes/session/loader.py | 71 +++++++++++++++++-- setup.py | 2 + tests/system/small/test_dataframe.py | 3 +- tests/system/small/test_session.py | 78 ++++++--------------- tests/unit/core/test_bf_utils.py | 2 +- third_party/bigframes_vendored/constants.py | 6 +- 10 files changed, 205 insertions(+), 78 deletions(-) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index baa29ba72b..d387e0b818 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -97,27 +97,46 @@ def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable: mat.validate() return mat - def to_pyarrow_table( + def to_arrow( self, *, offsets_col: Optional[str] = None, geo_format: Literal["wkb", "wkt"] = "wkt", duration_type: Literal["int", "duration"] = "duration", json_type: Literal["string"] = "string", - ) -> pa.Table: - pa_table = self.data - if offsets_col is not None: - pa_table = pa_table.append_column( - offsets_col, pa.array(range(pa_table.num_rows), type=pa.int64()) - ) + ) -> tuple[pa.Schema, Iterable[pa.RecordBatch]]: if geo_format != "wkt": raise NotImplementedError(f"geo format {geo_format} not yet implemented") - if duration_type != "duration": - raise NotImplementedError( - f"duration as {duration_type} not yet implemented" - ) assert json_type == "string" - return pa_table + + batches = 
self.data.to_batches() + schema = self.data.schema + if duration_type == "int": + schema = _schema_durations_to_ints(schema) + batches = map(functools.partial(_cast_pa_batch, schema=schema), batches) + + if offsets_col is not None: + return schema.append(pa.field(offsets_col, pa.int64())), _append_offsets( + batches, offsets_col + ) + else: + return schema, batches + + def to_pyarrow_table( + self, + *, + offsets_col: Optional[str] = None, + geo_format: Literal["wkb", "wkt"] = "wkt", + duration_type: Literal["int", "duration"] = "duration", + json_type: Literal["string"] = "string", + ) -> pa.Table: + schema, batches = self.to_arrow( + offsets_col=offsets_col, + geo_format=geo_format, + duration_type=duration_type, + json_type=json_type, + ) + return pa.Table.from_batches(batches, schema) def to_parquet( self, @@ -391,6 +410,41 @@ def _physical_type_replacements(dtype: pa.DataType) -> pa.DataType: return dtype +def _append_offsets( + batches: Iterable[pa.RecordBatch], offsets_col_name: str +) -> Iterable[pa.RecordBatch]: + offset = 0 + for batch in batches: + offsets = pa.array(range(offset, offset + batch.num_rows), type=pa.int64()) + batch_w_offsets = pa.record_batch( + [*batch.columns, offsets], + schema=batch.schema.append(pa.field(offsets_col_name, pa.int64())), + ) + offset += batch.num_rows + yield batch_w_offsets + + +@_recursive_map_types +def _durations_to_ints(type: pa.DataType) -> pa.DataType: + if pa.types.is_duration(type): + return pa.int64() + return type + + +def _schema_durations_to_ints(schema: pa.Schema) -> pa.Schema: + return pa.schema( + pa.field(field.name, _durations_to_ints(field.type)) for field in schema + ) + + +# TODO: Use RecordBatch.cast once min pyarrow>=16.0 +def _cast_pa_batch(batch: pa.RecordBatch, schema: pa.Schema) -> pa.RecordBatch: + return pa.record_batch( + [arr.cast(type) for arr, type in zip(batch.columns, schema.types)], + schema=schema, + ) + + def _pairwise(iterable): do_yield = False a = None diff --git 
a/bigframes/core/utils.py b/bigframes/core/utils.py index ee09fc69cb..9731857ea0 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -142,6 +142,9 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier) if not identifier: identifier = "id" + elif identifier[0].isdigit(): + # first character must be letter or underscore + identifier = "_" + identifier return identifier diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b189c0e194..6379a6f2e8 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -255,6 +255,7 @@ def __init__( session=self, bqclient=self._clients_provider.bqclient, storage_manager=self._temp_storage_manager, + write_client=self._clients_provider.bqstoragewriteclient, default_index_type=self._default_index_type, scan_index_uniqueness=self._strictly_ordered, force_total_order=self._strictly_ordered, @@ -731,7 +732,9 @@ def read_pandas( workload is such that you exhaust the BigQuery load job quota and your data cannot be embedded in SQL due to size or data type limitations. - + * "bigquery_write": + [Preview] Use the BigQuery Storage Write API. This feature + is in public preview. 
Returns: An equivalent bigframes.pandas.(DataFrame/Series/Index) object @@ -805,6 +808,10 @@ def _read_pandas( return self._loader.read_pandas( pandas_dataframe, method="stream", api_name=api_name ) + elif write_engine == "bigquery_write": + return self._loader.read_pandas( + pandas_dataframe, method="write", api_name=api_name + ) else: raise ValueError(f"Got unexpected write_engine '{write_engine}'") diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 86be8bd897..5ef974d565 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -134,6 +134,9 @@ def __init__( self._bqstoragereadclient: Optional[ google.cloud.bigquery_storage_v1.BigQueryReadClient ] = None + self._bqstoragewriteclient: Optional[ + google.cloud.bigquery_storage_v1.BigQueryWriteClient + ] = None self._cloudfunctionsclient: Optional[ google.cloud.functions_v2.FunctionServiceClient ] = None @@ -238,6 +241,34 @@ def bqstoragereadclient(self): return self._bqstoragereadclient + @property + def bqstoragewriteclient(self): + if not self._bqstoragewriteclient: + bqstorage_options = None + if "bqstoragewriteclient" in self._client_endpoints_override: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override["bqstoragewriteclient"] + ) + elif self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) + ) + + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqstoragewriteclient = ( + google.cloud.bigquery_storage_v1.BigQueryWriteClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) + ) + + return self._bqstoragewriteclient + @property def cloudfunctionsclient(self): if not self._cloudfunctionsclient: diff --git a/bigframes/session/loader.py 
b/bigframes/session/loader.py index b053ed3c90..76f12ae438 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -23,6 +23,7 @@ import typing from typing import ( Dict, + Generator, Hashable, IO, Iterable, @@ -36,12 +37,13 @@ import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions +from google.cloud import bigquery_storage_v1 import google.cloud.bigquery as bigquery -import google.cloud.bigquery.table +from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas import pyarrow as pa -from bigframes.core import local_data, utils +from bigframes.core import guid, local_data, utils import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.schema as schemata @@ -142,6 +144,7 @@ def __init__( self, session: bigframes.session.Session, bqclient: bigquery.Client, + write_client: bigquery_storage_v1.BigQueryWriteClient, storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager, default_index_type: bigframes.enums.DefaultIndexKind, scan_index_uniqueness: bool, @@ -149,6 +152,7 @@ def __init__( metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, ): self._bqclient = bqclient + self._write_client = write_client self._storage_manager = storage_manager self._default_index_type = default_index_type self._scan_index_uniqueness = scan_index_uniqueness @@ -165,7 +169,7 @@ def __init__( def read_pandas( self, pandas_dataframe: pandas.DataFrame, - method: Literal["load", "stream"], + method: Literal["load", "stream", "write"], api_name: str, ) -> dataframe.DataFrame: # TODO: Push this into from_pandas, along with index flag @@ -183,6 +187,8 @@ def read_pandas( array_value = self.load_data(managed_data, api_name=api_name) elif method == "stream": array_value = self.stream_data(managed_data) + elif method == "write": + array_value = self.write_data(managed_data) else: raise 
ValueError(f"Unsupported read method {method}") @@ -198,7 +204,7 @@ def load_data( self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None ) -> core.ArrayValue: """Load managed data into bigquery""" - ordering_col = "bf_load_job_offsets" + ordering_col = guid.generate_guid("load_offsets_") # JSON support incomplete for item in data.schema.items: @@ -244,7 +250,7 @@ def load_data( def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: """Load managed data into bigquery""" - ordering_col = "bf_stream_job_offsets" + ordering_col = guid.generate_guid("stream_offsets_") schema_w_offsets = data.schema.append( schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) ) @@ -277,6 +283,61 @@ def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: n_rows=data.data.num_rows, ).drop_columns([ordering_col]) + def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + """Load managed data into bigquery""" + ordering_col = guid.generate_guid("stream_offsets_") + schema_w_offsets = data.schema.append( + schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + ) + bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) + bq_table_ref = self._storage_manager.create_temp_table( + bq_schema, [ordering_col] + ) + + requested_stream = bq_storage_types.stream.WriteStream() + requested_stream.type_ = bq_storage_types.stream.WriteStream.Type.COMMITTED # type: ignore + + stream_request = bq_storage_types.CreateWriteStreamRequest( + parent=bq_table_ref.to_bqstorage(), write_stream=requested_stream + ) + stream = self._write_client.create_write_stream(request=stream_request) + + def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]: + schema, batches = data.to_arrow( + offsets_col=ordering_col, duration_type="int" + ) + offset = 0 + for batch in batches: + request = bq_storage_types.AppendRowsRequest( + write_stream=stream.name, offset=offset + ) + 
request.arrow_rows.writer_schema.serialized_schema = ( + schema.serialize().to_pybytes() + ) + request.arrow_rows.rows.serialized_record_batch = ( + batch.serialize().to_pybytes() + ) + offset += batch.num_rows + yield request + + for response in self._write_client.append_rows(requests=request_gen()): + if response.row_errors: + raise ValueError( + f"Problem loading at least one row from DataFrame: {response.row_errors}. {constants.FEEDBACK_LINK}" + ) + # This step isn't strictly necessary in COMMITTED mode, but avoids max active stream limits + response = self._write_client.finalize_write_stream(name=stream.name) + assert response.row_count == data.data.num_rows + + destination_table = self._bqclient.get_table(bq_table_ref) + return core.ArrayValue.from_table( + table=destination_table, + schema=schema_w_offsets, + session=self._session, + offsets_col=ordering_col, + n_rows=data.data.num_rows, + ).drop_columns([ordering_col]) + def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: formatting_helpers.wait_for_job( diff --git a/setup.py b/setup.py index edc77e11b6..532ec07511 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,8 @@ "google-cloud-bigtable >=2.24.0", "google-cloud-pubsub >=2.21.4", "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", + # 2.30 needed for arrow support. 
+ "google-cloud-bigquery-storage >= 2.30.0, < 3.0.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 83901562d3..452fa841e4 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -83,6 +83,7 @@ def test_df_construct_pandas_default(scalars_dfs): ("bigquery_inline"), ("bigquery_load"), ("bigquery_streaming"), + ("bigquery_write"), ], ) def test_read_pandas_all_nice_types( @@ -1772,7 +1773,7 @@ def test_len(scalars_dfs): ) @pytest.mark.parametrize( "write_engine", - ["bigquery_load", "bigquery_streaming"], + ["bigquery_load", "bigquery_streaming", "bigquery_write"], ) def test_df_len_local(session, n_rows, write_engine): assert ( diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 668c764645..1b886fbc5a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -38,6 +38,17 @@ import bigframes.ml.linear_model from tests.system import utils +all_write_engines = pytest.mark.parametrize( + "write_engine", + [ + "default", + "bigquery_inline", + "bigquery_load", + "bigquery_streaming", + "bigquery_write", + ], +) + @pytest.fixture(scope="module") def df_and_local_csv(scalars_df_index): @@ -865,10 +876,7 @@ def test_read_pandas_tokyo( assert len(expected) == result.total_rows -@pytest.mark.parametrize( - "write_engine", - ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], -) +@all_write_engines def test_read_pandas_timedelta_dataframes(session, write_engine): pytest.importorskip( "pandas", @@ -886,10 +894,7 @@ def test_read_pandas_timedelta_dataframes(session, write_engine): pd.testing.assert_frame_equal(actual_result, expected_result) -@pytest.mark.parametrize( - "write_engine", - ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], -) +@all_write_engines def 
test_read_pandas_timedelta_series(session, write_engine): expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) @@ -904,10 +909,7 @@ def test_read_pandas_timedelta_series(session, write_engine): ) -@pytest.mark.parametrize( - "write_engine", - ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], -) +@all_write_engines def test_read_pandas_timedelta_index(session, write_engine): expected_index = pd.to_timedelta( [1, 2, 3], unit="d" @@ -922,15 +924,7 @@ def test_read_pandas_timedelta_index(session, write_engine): pd.testing.assert_index_equal(actual_result, expected_index) -@pytest.mark.parametrize( - ("write_engine"), - [ - pytest.param("default"), - pytest.param("bigquery_load"), - pytest.param("bigquery_streaming"), - pytest.param("bigquery_inline"), - ], -) +@all_write_engines def test_read_pandas_json_dataframes(session, write_engine): json_data = [ "1", @@ -949,15 +943,7 @@ def test_read_pandas_json_dataframes(session, write_engine): pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) -@pytest.mark.parametrize( - ("write_engine"), - [ - pytest.param("default"), - pytest.param("bigquery_load"), - pytest.param("bigquery_streaming"), - pytest.param("bigquery_inline"), - ], -) +@all_write_engines def test_read_pandas_json_series(session, write_engine): json_data = [ "1", @@ -975,15 +961,7 @@ def test_read_pandas_json_series(session, write_engine): ) -@pytest.mark.parametrize( - ("write_engine"), - [ - pytest.param("default"), - pytest.param("bigquery_inline"), - pytest.param("bigquery_load"), - pytest.param("bigquery_streaming"), - ], -) +@all_write_engines def test_read_pandas_json_series_w_invalid_json(session, write_engine): json_data = [ "False", # Should be "false" @@ -994,15 +972,7 @@ def test_read_pandas_json_series_w_invalid_json(session, write_engine): session.read_pandas(pd_s, write_engine=write_engine) -@pytest.mark.parametrize( - ("write_engine"), - [ - pytest.param("default"), - 
pytest.param("bigquery_load"), - pytest.param("bigquery_streaming"), - pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)), - ], -) +@all_write_engines def test_read_pandas_json_index(session, write_engine): json_data = [ "1", @@ -1049,6 +1019,7 @@ def test_read_pandas_w_nested_json_fails(session, write_engine): pytest.param("default"), pytest.param("bigquery_inline"), pytest.param("bigquery_streaming"), + pytest.param("bigquery_write"), ], ) def test_read_pandas_w_nested_json(session, write_engine): @@ -1134,6 +1105,7 @@ def test_read_pandas_w_nested_json_index_fails(session, write_engine): pytest.param("default"), pytest.param("bigquery_inline"), pytest.param("bigquery_streaming"), + pytest.param("bigquery_write"), ], ) def test_read_pandas_w_nested_json_index(session, write_engine): @@ -1156,15 +1128,7 @@ def test_read_pandas_w_nested_json_index(session, write_engine): pd.testing.assert_index_equal(bq_idx, pd_idx) -@pytest.mark.parametrize( - ("write_engine",), - ( - ("default",), - ("bigquery_inline",), - ("bigquery_load",), - ("bigquery_streaming",), - ), -) +@all_write_engines def test_read_csv_for_gcs_file_w_write_engine(session, df_and_gcs_csv, write_engine): scalars_df, path = df_and_gcs_csv diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index cb3b03d988..9b4c4f8742 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -46,7 +46,7 @@ def test_get_standardized_ids_indexes(): assert col_ids == ["duplicate_2"] assert idx_ids == [ "string", - "0", + "_0", utils.UNNAMED_INDEX_ID, "duplicate", "duplicate_1", diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index d11d8ba2cb..af87694cd5 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -47,6 +47,10 @@ ) WriteEngineType = Literal[ - "default", "bigquery_inline", "bigquery_load", "bigquery_streaming" + "default", 
+ "bigquery_inline", + "bigquery_load", + "bigquery_streaming", + "bigquery_write", ] VALID_WRITE_ENGINES = typing.get_args(WriteEngineType) From 30a62372b2fd72deaf7dbc1dbce8b48f03b3041c Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 25 Apr 2025 14:17:43 -0700 Subject: [PATCH 17/28] chore: add blob display size options (#1657) --- bigframes/_config/experiment_options.py | 31 +++++++++++++++++++++++++ bigframes/dataframe.py | 20 +++++++++++++--- bigframes/operations/blob.py | 7 ++++-- samples/snippets/multimodal_test.py | 8 +++++-- 4 files changed, 59 insertions(+), 7 deletions(-) diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index bb3966839c..cfbcc04cfd 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional import warnings import bigframes.exceptions as bfe @@ -26,6 +27,9 @@ def __init__(self): self._semantic_operators: bool = False self._ai_operators: bool = False self._blob: bool = False + self._blob_display: bool = True + self._blob_display_width: Optional[int] = None + self._blob_display_height: Optional[int] = None @property def semantic_operators(self) -> bool: @@ -67,3 +71,30 @@ def blob(self, value: bool): ) warnings.warn(msg, category=bfe.PreviewWarning) self._blob = value + + @property + def blob_display(self) -> bool: + """Whether to display the blob content in notebook DataFrame preview. 
Default True.""" + return self._blob_display + + @blob_display.setter + def blob_display(self, value: bool): + self._blob_display = value + + @property + def blob_display_width(self) -> Optional[int]: + """Width in pixels that the blob constrained to.""" + return self._blob_display_width + + @blob_display_width.setter + def blob_display_width(self, value: Optional[int]): + self._blob_display_width = value + + @property + def blob_display_height(self) -> Optional[int]: + """Height in pixels that the blob constrained to.""" + return self._blob_display_height + + @blob_display_height.setter + def blob_display_height(self, value: Optional[int]): + self._blob_display_height = value diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7ce59760b3..101e7a1c5e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -765,7 +765,10 @@ def _repr_html_(self) -> str: return formatter.repr_query_job(self._compute_dry_run()) df = self.copy() - if bigframes.options.experiments.blob: + if ( + bigframes.options.experiments.blob + and bigframes.options.experiments.blob_display + ): blob_cols = [ col for col in df.columns @@ -788,7 +791,10 @@ def _repr_html_(self) -> str: with display_options.pandas_repr(opts): # Allows to preview images in the DataFrame. The implementation changes the string repr as well, that it doesn't truncate strings or escape html charaters such as "<" and ">". We may need to implement a full-fledged repr module to better support types not in pandas. 
- if bigframes.options.experiments.blob: + if ( + bigframes.options.experiments.blob + and bigframes.options.experiments.blob_display + ): def obj_ref_rt_to_html(obj_ref_rt) -> str: obj_ref_rt_json = json.loads(obj_ref_rt) @@ -799,8 +805,16 @@ def obj_ref_rt_to_html(obj_ref_rt) -> str: str, gcs_metadata.get("content_type", "") ) if content_type.startswith("image"): + size_str = "" + if bigframes.options.experiments.blob_display_width: + size_str = f' width="{bigframes.options.experiments.blob_display_width}"' + if bigframes.options.experiments.blob_display_height: + size_str = ( + size_str + + f' height="{bigframes.options.experiments.blob_display_height}"' + ) url = obj_ref_rt_json["access_urls"]["read_url"] - return f'' + return f'' return f'uri: {obj_ref_rt_json["objectref"]["uri"]}, authorizer: {obj_ref_rt_json["objectref"]["authorizer"]}' diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index f0e25852cb..2c6e5fca7f 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -241,9 +241,12 @@ def display( Args: n (int, default 3): number of sample blob objects to display. content_type (str, default ""): content type of the blob. If unset, use the blob metadata of the storage. Possible values are "image", "audio" and "video". - width (int or None, default None): width in pixels that the image/video are constrained to. If unset, use the image/video's original size or ratio. No-op for other content types. - height (int or None, default None): height in pixels that the image/video are constrained to. If unset, use the image/video's original size or ratio. No-op for other content types. + width (int or None, default None): width in pixels that the image/video are constrained to. If unset, use the global setting in bigframes.options.experiments.blob_display_width, otherwise image/video's original size or ratio is used. No-op for other content types. 
+ height (int or None, default None): height in pixels that the image/video are constrained to. If unset, use the global setting in bigframes.options.experiments.blob_display_height, otherwise image/video's original size or ratio is used. No-op for other content types. """ + width = width or bigframes.options.experiments.blob_display_width + height = height or bigframes.options.experiments.blob_display_height + # col name doesn't matter here. Rename to avoid column name conflicts df = bigframes.series.Series(self._block).rename("blob_col").to_frame() diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 85e118d671..27a7998ff9 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -21,6 +21,8 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: # Flag to enable the feature bigframes.options.experiments.blob = True + # Flags to control preview image/video preview size + bigframes.options.experiments.blob_display_width = 300 import bigframes.pandas as bpd @@ -47,10 +49,12 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: df_image["size"] = df_image["image"].blob.size() df_image["updated"] = df_image["image"].blob.updated() df_image + # [END bigquery_dataframes_multimodal_dataframe_merge] + # [START bigquery_dataframes_multimodal_dataframe_filter] # Filter images and display, you can also display audio and video types. Use width/height parameters to constrain window sizes. 
- df_image[df_image["author"] == "alice"]["image"].blob.display(width=400) - # [END bigquery_dataframes_multimodal_dataframe_merge] + df_image[df_image["author"] == "alice"]["image"].blob.display() + # [END bigquery_dataframes_multimodal_dataframe_filter] # [START bigquery_dataframes_multimodal_dataframe_image_transform] df_image["blurred"] = df_image["image"].blob.image_blur( From ebb7a5e2b24fa57d6fe6a76d9b857ad44c67d194 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Mon, 28 Apr 2025 17:50:32 -0400 Subject: [PATCH 18/28] fix: resolve issue where pre-release versions of google-auth are installed (#1491) Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 532ec07511..d0ee4da405 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "fsspec >=2023.3.0", "gcsfs >=2023.3.0", "geopandas >=0.12.2", - "google-auth >=2.15.0,<3.0dev", + "google-auth >=2.15.0,<3.0", "google-cloud-bigtable >=2.24.0", "google-cloud-pubsub >=2.21.4", "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", From f442e7a07ff273ba3af74eeabafb62110b78f692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 28 Apr 2025 17:54:58 -0500 Subject: [PATCH 19/28] docs: fix `bq_dataframes_template` notebook to work if partial ordering mode is enabled (#1665) * docs: fix `bq_dataframes_template` notebook to work if partial ordering mode is enabled * use reset_index in plot, show output for generative ai --- .../bq_dataframes_template.ipynb | 549 ++++++++++-------- 1 file changed, 298 insertions(+), 251 deletions(-) diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index c79efb63e5..12847483ac 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -114,7 +114,7 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { "id": "oM1iC_MfAts1" }, @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "NPPMuw2PXGeo" }, @@ -199,7 +199,7 @@ "# Note: BigQuery DataFrames objects are by default fully ordered like Pandas.\n", "# If ordering is not important for you, you can uncomment the following\n", "# expression to run BigQuery DataFrames in partial ordering mode.\n", - "#bpd.options.bigquery.ordering_mode = \"partial\"\n", + "# bpd.options.bigquery.ordering_mode = \"partial\"\n", "\n", "# Note: By default BigQuery DataFrames emits out BigQuery job metadata via a\n", "# progress bar. But in this notebook let's disable the progress bar to keep the\n", @@ -249,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": { "id": "Vyex9BQI-BNa" }, @@ -271,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -306,53 +306,53 @@ " \n", " \n", " \n", - " 198\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 43.3\n", - " 13.4\n", - " 209.0\n", - " 4400.0\n", + " 0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 36.6\n", + " 18.4\n", + " 184.0\n", + " 3475.0\n", " FEMALE\n", " \n", " \n", - " 235\n", + " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Torgersen\n", - " 35.1\n", - " 19.4\n", - " 193.0\n", - " 4200.0\n", + " Dream\n", + " 39.8\n", + " 19.1\n", + " 184.0\n", + " 4650.0\n", " MALE\n", " \n", " \n", - " 317\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", + " 2\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 45.4\n", - " 18.7\n", - " 
188.0\n", - " 3525.0\n", - " FEMALE\n", + " 40.9\n", + " 18.9\n", + " 184.0\n", + " 3900.0\n", + " MALE\n", " \n", " \n", - " 117\n", + " 3\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 48.5\n", - " 17.5\n", - " 191.0\n", - " 3400.0\n", - " MALE\n", + " 46.5\n", + " 17.9\n", + " 192.0\n", + " 3500.0\n", + " FEMALE\n", " \n", " \n", - " 159\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", + " 4\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 45.6\n", - " 19.4\n", - " 194.0\n", - " 3525.0\n", + " 37.3\n", + " 16.8\n", + " 192.0\n", + " 3000.0\n", " FEMALE\n", " \n", " \n", @@ -360,22 +360,22 @@ "" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "198 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", - "235 Adelie Penguin (Pygoscelis adeliae) Torgersen 35.1 \n", - "317 Chinstrap penguin (Pygoscelis antarctica) Dream 45.4 \n", - "117 Chinstrap penguin (Pygoscelis antarctica) Dream 48.5 \n", - "159 Chinstrap penguin (Pygoscelis antarctica) Dream 45.6 \n", + " species island culmen_length_mm \\\n", + "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", + "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", + "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", + "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "198 13.4 209.0 4400.0 FEMALE \n", - "235 19.4 193.0 4200.0 MALE \n", - "317 18.7 188.0 3525.0 FEMALE \n", - "117 17.5 191.0 3400.0 MALE \n", - "159 19.4 194.0 3525.0 FEMALE " + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "0 18.4 184.0 3475.0 FEMALE \n", + "1 19.1 184.0 4650.0 MALE \n", + "2 18.9 184.0 3900.0 MALE \n", + "3 17.9 192.0 3500.0 FEMALE \n", + "4 16.8 192.0 3000.0 FEMALE " ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -424,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 
6, + "execution_count": 8, "metadata": { "id": "YKwCW7Nsavap" }, @@ -433,7 +433,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "average_body_mass: 4201.754385964906\n" + "average_body_mass: 4201.754385964914\n" ] } ], @@ -453,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": { "id": "4PyKMR61-Mjy" }, @@ -514,7 +514,7 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -543,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -559,21 +559,21 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "133 {'culmen_length_mm': None, 'culmen_depth_mm': ...\n", - "279 {'culmen_length_mm': 37.9, 'culmen_depth_mm': ...\n", - "34 {'culmen_length_mm': 37.8, 'culmen_depth_mm': ...\n", - "208 {'culmen_length_mm': 40.5, 'culmen_depth_mm': ...\n", - "96 {'culmen_length_mm': 37.7, 'culmen_depth_mm': ...\n", + "0 {'culmen_length_mm': 36.6, 'culmen_depth_mm': ...\n", + "1 {'culmen_length_mm': 39.8, 'culmen_depth_mm': ...\n", + "2 {'culmen_length_mm': 40.9, 'culmen_depth_mm': ...\n", + "3 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ...\n", + "4 {'culmen_length_mm': 37.3, 'culmen_depth_mm': ...\n", "dtype: struct[pyarrow]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -594,21 +594,21 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "133 \n", - "279 18.6\n", - "34 18.3\n", - "96 18.7\n", - "208 18.9\n", + "0 18.4\n", + "1 19.1\n", + "2 18.9\n", + "3 17.9\n", + "4 16.8\n", "dtype: Float64" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -639,22 +639,32 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:263: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n", + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:239: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" + ] + }, { "data": { "text/plain": [ "" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -664,12 +674,17 @@ } ], "source": [ - "df.plot(title=\"Numeric features\")" + "(\n", + " df\n", + " .sort_values(\"body_mass_g\")\n", + " .reset_index(drop=True)\n", + " .plot(title=\"Numeric features\", logy=True)\n", + ")" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -678,13 +693,13 @@ "" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -709,12 +724,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -747,7 +762,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -801,47 +816,67 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop any rows that has missing (NA) values\n", + "df = df.dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Part of preparing data for a machine learning task is splitting it into subsets for training and testing to ensure that the solution is not overfitting. By default, BQML will automatically manage splitting the data for you. However, BQML also supports manually splitting out your training data.\n", + "\n", + "Performing a manual data split can be done with `bigframes.ml.model_selection.train_test_split` like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:263: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n", + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:239: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "\n", - " X shape: (334, 6)\n", - " y shape: (334, 1)\n", + " df_train shape: (267, 7)\n", + " df_test shape: (67, 7)\n", "\n" ] } ], "source": [ - "# Drop any rows that has missing (NA) values\n", - "df = df.dropna()\n", + "from bigframes.ml.model_selection import train_test_split\n", "\n", - "# Isolate input features and output variable into DataFrames\n", - "X = df[['island', 'culmen_length_mm', 'culmen_depth_mm', 
'flipper_length_mm', 'sex', 'species']]\n", - "y = df[['body_mass_g']]\n", "\n", - "# Print the shapes of features and label\n", + "# This will split df into test and training sets, with 20% of the rows in the test set,\n", + "# and the rest in the training set\n", + "df_train, df_test = train_test_split(df, test_size=0.2)\n", + "\n", + "# Show the shape of the data after the split\n", "print(f\"\"\"\n", - " X shape: {X.shape}\n", - " y shape: {y.shape}\n", + " df_train shape: {df_train.shape}\n", + " df_test shape: {df_test.shape}\n", "\"\"\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Part of preparing data for a machine learning task is splitting it into subsets for training and testing to ensure that the solution is not overfitting. By default, BQML will automatically manage splitting the data for you. However, BQML also supports manually splitting out your training data.\n", - "\n", - "Performing a manual data split can be done with `bigframes.ml.model_selection.train_test_split` like so:" - ] - }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -850,7 +885,7 @@ "text": [ "\n", " X_train shape: (267, 6)\n", - " X_test shape: (67, 6)\n", + " X_test shape: (67, 7)\n", " y_train shape: (267, 1)\n", " y_test shape: (67, 1)\n", "\n" @@ -858,13 +893,31 @@ } ], "source": [ - "from bigframes.ml.model_selection import train_test_split\n", + "# Isolate input features and output variable into DataFrames\n", + "X_train = df_train[[\n", + " 'island',\n", + " 'culmen_length_mm',\n", + " 'culmen_depth_mm',\n", + " 'flipper_length_mm',\n", + " 'sex',\n", + " 'species',\n", + "]]\n", + "y_train = df_train[['body_mass_g']]\n", "\n", - "# This will split X and y into test and training sets, with 20% of the rows in the test set,\n", - "# and the rest in the training set\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + "X_test = df_test[[\n", + " 
'island',\n", + " 'culmen_length_mm',\n", + " 'culmen_depth_mm',\n", + " 'flipper_length_mm',\n", + " 'sex',\n", + " 'species',\n", + " # Include the actual body_mass_g so that we can compare with the predicted\n", + " # without a join.\n", + " 'body_mass_g'\n", + "]]\n", + "y_test = df_test[['body_mass_g']]\n", "\n", - "# Show the shape of the data after the split\n", + "# Print the shapes of features and label\n", "print(f\"\"\"\n", " X_train shape: {X_train.shape}\n", " X_test shape: {X_test.shape}\n", @@ -885,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -901,7 +954,7 @@ " ('linreg', LinearRegression(fit_intercept=False))])" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -939,7 +992,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -970,80 +1023,86 @@ " flipper_length_mm\n", " sex\n", " species\n", + " body_mass_g\n", " \n", " \n", " \n", " \n", - " 37\n", - " -18640.718256\n", - " Biscoe\n", - " 44.5\n", - " 15.7\n", - " 217.0\n", - " .\n", - " Gentoo penguin (Pygoscelis papua)\n", - " \n", - " \n", - " 245\n", - " 3109.962252\n", + " 0\n", + " 3444.135246\n", " Dream\n", - " 33.1\n", - " 16.1\n", - " 178.0\n", + " 42.2\n", + " 18.5\n", + " 180.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", + " 3550.0\n", " \n", " \n", - " 267\n", - " 3372.443434\n", + " 1\n", + " 3735.564386\n", " Torgersen\n", - " 41.1\n", - " 17.6\n", - " 182.0\n", - " FEMALE\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " MALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", + " 3750.0\n", " \n", " \n", - " 280\n", - " 3341.376012\n", - " Torgersen\n", - " 36.6\n", - " 17.8\n", - " 185.0\n", - " FEMALE\n", + " 2\n", + " 3879.370094\n", + " Dream\n", + " 40.9\n", + " 18.9\n", + " 184.0\n", + " MALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", + " 3900.0\n", " \n", " \n", - " 
40\n", - " 3310.178937\n", + " 3\n", + " 3787.401253\n", " Biscoe\n", - " 37.6\n", - " 17.0\n", + " 38.2\n", + " 18.1\n", " 185.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " 3950.0\n", + " \n", + " \n", + " 4\n", + " 3435.804331\n", + " Dream\n", + " 36.0\n", + " 18.5\n", + " 186.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", + " 3100.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " predicted_body_mass_g island culmen_length_mm culmen_depth_mm \\\n", - "37 -18640.718256 Biscoe 44.5 15.7 \n", - "245 3109.962252 Dream 33.1 16.1 \n", - "267 3372.443434 Torgersen 41.1 17.6 \n", - "280 3341.376012 Torgersen 36.6 17.8 \n", - "40 3310.178937 Biscoe 37.6 17.0 \n", + " predicted_body_mass_g island culmen_length_mm culmen_depth_mm \\\n", + "0 3444.135246 Dream 42.2 18.5 \n", + "1 3735.564386 Torgersen 39.1 18.7 \n", + "2 3879.370094 Dream 40.9 18.9 \n", + "3 3787.401253 Biscoe 38.2 18.1 \n", + "4 3435.804331 Dream 36.0 18.5 \n", "\n", - " flipper_length_mm sex species \n", - "37 217.0 . 
Gentoo penguin (Pygoscelis papua) \n", - "245 178.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "267 182.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "280 185.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "40 185.0 FEMALE Adelie Penguin (Pygoscelis adeliae) " + " flipper_length_mm sex species body_mass_g \n", + "0 180.0 FEMALE Adelie Penguin (Pygoscelis adeliae) 3550.0 \n", + "1 181.0 MALE Adelie Penguin (Pygoscelis adeliae) 3750.0 \n", + "2 184.0 MALE Adelie Penguin (Pygoscelis adeliae) 3900.0 \n", + "3 185.0 MALE Adelie Penguin (Pygoscelis adeliae) 3950.0 \n", + "4 186.0 FEMALE Adelie Penguin (Pygoscelis adeliae) 3100.0 " ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1070,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1105,12 +1164,12 @@ " \n", " \n", " 0\n", - " 582.272638\n", - " 8337651.200465\n", - " 0.004989\n", - " 193.446297\n", - " -11.273389\n", - " -11.091156\n", + " 212.800303\n", + " 72655.272611\n", + " 0.004369\n", + " 144.426983\n", + " 0.877546\n", + " 0.877991\n", " \n", " \n", "\n", @@ -1118,22 +1177,22 @@ "[1 rows x 6 columns in total]" ], "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 582.272638 8337651.200465 0.004989 \n", + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 212.800303 72655.272611 0.004369 \n", "\n", - " median_absolute_error r2_score explained_variance \n", - "0 193.446297 -11.273389 -11.091156 \n", + " median_absolute_error r2_score explained_variance \n", + " 144.426983 0.877546 0.877991 \n", "\n", "[1 rows x 6 columns]" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pipeline.score(X_test, y_test)" + "pipeline.score(X_test.drop(columns=[\"body_mass_g\"]), y_test)" ] }, { @@ -1145,16 +1204,16 @@ }, { "cell_type": "code", - 
"execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "-11.273389374372979" + "np.float64(0.8775458183087934)" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1162,7 +1221,7 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "r2_score(y_test, y_pred[\"predicted_body_mass_g\"])" + "r2_score(y_pred['body_mass_g'], y_pred[\"predicted_body_mass_g\"])" ] }, { @@ -1187,7 +1246,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1241,17 +1300,17 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 22, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# df = bpd.DataFrame(\n", - "# {\n", - "# \"prompt\": [\"What is BigQuery?\", \"What is BQML?\", \"What is BigQuery DataFrames?\"],\n", - "# })\n", - "# df" + "df = bpd.DataFrame(\n", + " {\n", + " \"prompt\": [\"What is BigQuery?\", \"What is BQML?\", \"What is BigQuery DataFrames?\"],\n", + " })\n", + "df" ] }, { @@ -1265,15 +1324,18 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shobs/code/bigframes/bigframes/core/__init__.py:114: PreviewWarning: Interpreting JSON column(s) as pyarrow.large_string. 
This behavior may change in future versions.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n", + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:263: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" ] }, { @@ -1305,28 +1367,23 @@ " \n", " \n", " \n", - " 0\n", - " ## BigQuery: Your Data Warehouse in the Cloud\n", - "...\n", - " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", + " 1\n", + " BQML stands for **BigQuery Machine Learning**....\n", + " <NA>\n", " \n", - " What is BigQuery?\n", + " What is BQML?\n", " \n", " \n", - " 1\n", - " ## BQML - BigQuery Machine Learning\n", - "\n", - "BQML stan...\n", - " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", + " 0\n", + " BigQuery is a fully managed, serverless data w...\n", + " <NA>\n", " \n", - " What is BQML?\n", + " What is BigQuery?\n", " \n", " \n", " 2\n", - " ## BigQuery DataFrames\n", - "\n", - "BigQuery DataFrames is...\n", - " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", + " BigQuery DataFrames are a Python library that ...\n", + " <NA>\n", " \n", " What is BigQuery DataFrames?\n", " \n", @@ -1337,29 +1394,24 @@ ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 ## BigQuery: Your Data Warehouse in the Cloud\n", - "... \n", - "1 ## BQML - BigQuery Machine Learning\n", - "\n", - "BQML stan... \n", - "2 ## BigQuery DataFrames\n", + "1 BQML stands for **BigQuery Machine Learning**.... \n", + "0 BigQuery is a fully managed, serverless data w... \n", + "2 BigQuery DataFrames are a Python library that ... 
\n", "\n", - "BigQuery DataFrames is... \n", - "\n", - " ml_generate_text_rai_result ml_generate_text_status \\\n", - "0 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", - "1 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", - "2 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", + " ml_generate_text_rai_result ml_generate_text_status \\\n", + "1 \n", + "0 \n", + "2 \n", "\n", " prompt \n", - "0 What is BigQuery? \n", "1 What is BQML? \n", + "0 What is BigQuery? \n", "2 What is BigQuery DataFrames? \n", "\n", "[3 rows x 4 columns]" ] }, - "execution_count": 23, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1382,49 +1434,44 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "## BigQuery DataFrames\n", - "\n", - "BigQuery DataFrames is a Python library that allows you to interact with BigQuery data using the familiar Pandas API. 
This means you can use all the powerful tools and methods from the Pandas library to explore, analyze, and transform your BigQuery data, without needing to learn a new language or API.\n", - "\n", - "Here are some of the key benefits of using BigQuery DataFrames:\n", - "\n", - "* **Ease of use:** If you're already familiar with Pandas, you can start using BigQuery DataFrames with minimal learning curve.\n", - "* **Speed and efficiency:** BigQuery DataFrames leverages the power of BigQuery to perform complex operations on large datasets efficiently.\n", - "* **Flexibility:** You can use BigQuery DataFrames for a wide range of tasks, including data exploration, analysis, cleaning, and transformation.\n", - "* **Integration with other tools:** BigQuery DataFrames integrates seamlessly with other Google Cloud tools like Colab and Vertex AI, allowing you to build end-to-end data analysis pipelines.\n", - "\n", - "Here are some of the key features of BigQuery DataFrames:\n", - "\n", - "* **Support for most Pandas operations:** You can use most of the DataFrame methods you're familiar with, such as `groupby`, `filter`, `sort_values`, and `apply`.\n", - "* **Automatic schema inference:** BigQuery DataFrames automatically infers the schema of your data, so you don't need to manually specify it.\n", - "* **Efficient handling of large datasets:** BigQuery DataFrames pushes computations to BigQuery, which allows you to work with large datasets without running out of memory.\n", - "* **Support for both public and private datasets:** You can use BigQuery DataFrames to access both public and private datasets stored in BigQuery.\n", - "\n", - "## Getting Started with BigQuery DataFrames\n", - "\n", - "Getting started with BigQuery DataFrames is easy. You just need to install the library and configure your authentication. 
Once you're set up, you can start using it to interact with your BigQuery data.\n", - "\n", - "Here are some resources to help you get started:\n", - "\n", - "* **Documentation:** https://cloud.google.com/bigquery/docs/reference/libraries/bigquery-dataframe\n", - "* **Quickstart:** https://cloud.google.com/bigquery/docs/reference/libraries/bigquery-dataframe-python-quickstart\n", - "* **Tutorials:** https://cloud.google.com/bigquery/docs/tutorials/bq-dataframe-pandas-tutorial\n", - "\n", - "## Conclusion\n", - "\n", - "BigQuery DataFrames is a powerful tool that can help you get the most out of your BigQuery data. If you're looking for a way to easily analyze and transform your BigQuery data using the familiar Pandas API, then BigQuery DataFrames is a great option.\n" - ] + "data": { + "text/markdown": [ + "BigQuery DataFrames are a Python library that provides a Pandas-like interface for interacting with BigQuery data. Instead of loading entire datasets into memory (which is impossible for very large BigQuery tables), BigQuery DataFrames allow you to work with BigQuery data in a way that feels familiar if you've used Pandas, but leverages BigQuery's processing power for efficiency. This means you can perform data analysis and manipulation on datasets that are too large for Pandas to handle directly.\n", + "\n", + "Key features and characteristics include:\n", + "\n", + "* **Lazy Evaluation:** BigQuery DataFrames don't load the entire dataset into memory. Operations are expressed as queries that are executed in BigQuery only when necessary (e.g., when you call `.to_dataframe()` to materialize a result, or when you explicitly trigger execution). This significantly reduces memory consumption.\n", + "\n", + "* **Pandas-like API:** The library aims for a familiar API similar to Pandas. 
You can use many of the same functions and methods you would use with Pandas DataFrames, such as filtering, selecting columns, aggregations, and joining.\n", + "\n", + "* **Integration with BigQuery:** The library seamlessly integrates with BigQuery. It allows you to read data from BigQuery tables and write data back to BigQuery.\n", + "\n", + "* **Scalability:** Because the processing happens in BigQuery, BigQuery DataFrames can scale to handle datasets of virtually any size. It's designed to efficiently process terabytes or even petabytes of data.\n", + "\n", + "* **Performance:** While providing a user-friendly interface, BigQuery DataFrames leverages BigQuery's optimized query engine for fast execution of operations.\n", + "\n", + "* **SQL integration:** While providing a Pythonic interface, you can easily incorporate SQL queries directly within the DataFrame operations providing flexibility and control over the data manipulation.\n", + "\n", + "\n", + "**In short:** BigQuery DataFrames provide a powerful and efficient way to work with large BigQuery datasets using a familiar Pandas-like syntax without the memory limitations of loading the entire dataset into local memory. 
They bridge the gap between the ease of use of Pandas and the scalability of BigQuery.\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# print(pred.loc[2][\"ml_generate_text_llm_result\"])" + "# import IPython.display\n", + "\n", + "# IPython.display.Markdown(pred.loc[2][\"ml_generate_text_llm_result\"])" ] }, { @@ -1443,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1491,7 +1538,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.6" } }, "nbformat": 4, From 3eadf75defd71627ea98c8073eee669a5da3c9f8 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 29 Apr 2025 09:33:41 -0700 Subject: [PATCH 20/28] refactor: add compile_readlocal for SQLGlotCompiler (#1663) This change is adding compile_readlocal for SQLGlotCompiler class and also introducing snapshot as dev dependency for unit tests. 
--- bigframes/core/compile/compiled.py | 8 +- bigframes/core/compile/compiler.py | 4 +- bigframes/core/compile/sqlglot/compiler.py | 59 +++++++--- bigframes/core/compile/sqlglot/sql_gen.py | 38 ------- bigframes/core/compile/sqlglot/sqlglot_ir.py | 103 ++++++++++++++++++ .../core/compile/sqlglot/sqlglot_types.py | 2 +- bigframes/core/nodes.py | 4 +- noxfile.py | 4 +- setup.py | 3 +- tests/unit/core/compile/sqlglot/conftest.py | 13 ++- .../test_compile_readlocal/out.sql | 3 + .../compile/sqlglot/test_compile_readlocal.py | 10 +- 12 files changed, 176 insertions(+), 75 deletions(-) delete mode 100644 bigframes/core/compile/sqlglot/sql_gen.py create mode 100644 bigframes/core/compile/sqlglot/sqlglot_ir.py create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index a79ad9fe55..314b54fc6d 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -288,13 +288,7 @@ def _reproject_to_table(self) -> UnorderedIR: def from_polars( cls, pa_table: pa.Table, schema: Sequence[bigquery.SchemaField] ) -> UnorderedIR: - # TODO: add offsets - """ - Builds an in-memory only (SQL only) expr from a pandas dataframe. - - Assumed that the dataframe has unique string column names and bigframes-suppported - dtypes. 
- """ + """Builds an in-memory only (SQL only) expr from a pyarrow table.""" import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes # derive the ibis schema from the original pandas schema diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index a778889925..21045ca348 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -22,7 +22,6 @@ import bigframes_vendored.ibis.expr.api as ibis_api import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types -import google.cloud.bigquery import pyarrow as pa from bigframes import dtypes, operations @@ -169,7 +168,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): pa_table = node.local_data_source.data bq_schema = node.schema.to_bigquery() - pa_table = pa_table.select(list(item.source_id for item in node.scan_list.items)) + pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) pa_table = pa_table.rename_columns( {item.source_id: item.id.sql for item in node.scan_list.items} ) @@ -178,7 +177,6 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): pa_table = pa_table.append_column( offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) ) - bq_schema = (*bq_schema, google.cloud.bigquery.SchemaField(offsets, "INT64")) return compiled.UnorderedIR.from_polars(pa_table, bq_schema) diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index df800c5489..2b78a6e458 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -15,14 +15,15 @@ import dataclasses import functools +import itertools import typing -import google.cloud.bigquery as bigquery -import sqlglot.expressions as sge +from google.cloud import bigquery +import pyarrow as pa -from bigframes.core import expression, nodes, rewrite +from bigframes.core import expression, 
identifiers, nodes, rewrite from bigframes.core.compile import configs -from bigframes.core.compile.sqlglot import sql_gen +import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.core.ordering as bf_ordering @@ -30,8 +31,6 @@ class SQLGlotCompiler: """Compiles BigFrame nodes into SQL using SQLGlot.""" - sql_gen = sql_gen.SQLGen() - def compile( self, node: nodes.BigFrameNode, @@ -81,6 +80,7 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult result_node = typing.cast( nodes.ResultNode, rewrite.column_pruning(result_node) ) + result_node = _remap_variables(result_node) sql = self._compile_result_node(result_node) return configs.CompileResult( sql, result_node.schema.to_bigquery(), result_node.order_by @@ -89,6 +89,8 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult ordering: typing.Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) + + result_node = _remap_variables(result_node) sql = self._compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: @@ -103,9 +105,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult ) def _compile_result_node(self, root: nodes.ResultNode) -> str: - sqlglot_expr = compile_node(root.child) + sqlglot_ir = compile_node(root.child) # TODO: add order_by, limit, and selections to sqlglot_expr - return self.sql_gen.sql(sqlglot_expr) + return sqlglot_ir.sql def _replace_unsupported_ops(node: nodes.BigFrameNode): @@ -115,27 +117,52 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode): return node +def _remap_variables(node: nodes.ResultNode) -> nodes.ResultNode: + """Remaps `ColumnId`s in the BFET of a `ResultNode` to produce deterministic UIDs.""" + + def anonymous_column_ids() -> 
typing.Generator[identifiers.ColumnId, None, None]: + for i in itertools.count(): + yield identifiers.ColumnId(name=f"bfcol_{i}") + + result_node, _ = rewrite.remap_variables(node, anonymous_column_ids()) + return typing.cast(nodes.ResultNode, result_node) + + @functools.lru_cache(maxsize=5000) -def compile_node(node: nodes.BigFrameNode) -> sge.Expression: - """Compile node into CompileArrayValue. Caches result.""" +def compile_node(node: nodes.BigFrameNode) -> ir.SQLGlotIR: + """Compiles node into CompileArrayValue. Caches result.""" return node.reduce_up(lambda node, children: _compile_node(node, *children)) @functools.singledispatch def _compile_node( - node: nodes.BigFrameNode, *compiled_children: sge.Expression -) -> sge.Expression: + node: nodes.BigFrameNode, *compiled_children: ir.SQLGlotIR +) -> ir.SQLGlotIR: """Defines transformation but isn't cached, always use compile_node instead""" raise ValueError(f"Can't compile unrecognized node: {node}") @_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode, *args) -> sge.Expression: - # TODO: add support for reading from local files - return sge.select() +def compile_readlocal(node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: + offsets = node.offsets_col.sql if node.offsets_col else None + schema_names = node.schema.names + schema_dtypes = node.schema.dtypes + + pa_table = node.local_data_source.data + pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) + pa_table = pa_table.rename_columns( + {item.source_id: item.id.sql for item in node.scan_list.items} + ) + + if offsets: + pa_table = pa_table.append_column( + offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) + ) + + return ir.SQLGlotIR.from_pandas(pa_table.to_pandas(), schema_names, schema_dtypes) @_compile_node.register -def compile_selection(node: nodes.SelectionNode, child: sge.Expression): +def compile_selection(node: nodes.SelectionNode, child: ir.SQLGlotIR): # TODO: add support for selection return 
child diff --git a/bigframes/core/compile/sqlglot/sql_gen.py b/bigframes/core/compile/sqlglot/sql_gen.py deleted file mode 100644 index 85784e3f5d..0000000000 --- a/bigframes/core/compile/sqlglot/sql_gen.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses - -import sqlglot.dialects.bigquery -import sqlglot.expressions as sge - - -@dataclasses.dataclass(frozen=True) -class SQLGen: - """Helper class to build SQLGlot Query and generate SQL string.""" - - dialect = sqlglot.dialects.bigquery.BigQuery - """The SQL dialect used for generation.""" - - quoted: bool = True - """Whether to quote identifiers in the generated SQL.""" - - pretty: bool = True - """Whether to pretty-print the generated SQL.""" - - def sql(self, expr: sge.Expression) -> str: - """Generate SQL string from the given expression.""" - return expr.sql(dialect=self.dialect, pretty=self.pretty) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py new file mode 100644 index 0000000000..836e7f17ba --- /dev/null +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -0,0 +1,103 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +import typing + +import pandas as pd +import sqlglot as sg +import sqlglot.dialects.bigquery +import sqlglot.expressions as sge + +from bigframes import dtypes +import bigframes.core.compile.sqlglot.sqlglot_types as sgt + + +@dataclasses.dataclass(frozen=True) +class SQLGlotIR: + """Helper class to build SQLGlot Query and generate SQL string.""" + + expr: sge.Expression = sge.Expression() + """The SQLGlot expression representing the query.""" + + dialect = sqlglot.dialects.bigquery.BigQuery + """The SQL dialect used for generation.""" + + quoted: bool = True + """Whether to quote identifiers in the generated SQL.""" + + pretty: bool = True + """Whether to pretty-print the generated SQL.""" + + @property + def sql(self) -> str: + """Generate SQL string from the given expression.""" + return self.expr.sql(dialect=self.dialect, pretty=self.pretty) + + @classmethod + def from_pandas( + cls, + pd_df: pd.DataFrame, + schema_names: typing.Sequence[str], + schema_dtypes: typing.Sequence[dtypes.Dtype], + ) -> SQLGlotIR: + """Builds SQLGlot expression from pyarrow table.""" + dtype_expr = sge.DataType( + this=sge.DataType.Type.STRUCT, + expressions=[ + sge.ColumnDef( + this=sge.to_identifier(name, quoted=True), + kind=sgt.SQLGlotType.from_bigframes_dtype(dtype), + ) + for name, dtype in zip(schema_names, schema_dtypes) + ], + nested=True, + ) + data_expr = [ + sge.Tuple( + expressions=tuple( + _literal( + value=value, + dtype=sgt.SQLGlotType.from_bigframes_dtype(dtype), + ) + for value, dtype in zip(row, 
schema_dtypes) + ) + ) + for _, row in pd_df.iterrows() + ] + expr = sge.Unnest( + expressions=[ + sge.DataType( + this=sge.DataType.Type.ARRAY, + expressions=[dtype_expr], + nested=True, + values=data_expr, + ), + ], + ) + return cls(expr=sg.select(sge.Star()).from_(expr)) + + +def _literal(value: typing.Any, dtype: str) -> sge.Expression: + if value is None: + return _cast(sge.Null(), dtype) + + # TODO: handle other types like visit_DefaultLiteral + return sge.convert(value) + + +def _cast(arg, to) -> sge.Cast: + return sge.Cast(this=arg, to=to) diff --git a/bigframes/core/compile/sqlglot/sqlglot_types.py b/bigframes/core/compile/sqlglot/sqlglot_types.py index 06c78c1435..0cfeaae3e9 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_types.py +++ b/bigframes/core/compile/sqlglot/sqlglot_types.py @@ -32,7 +32,7 @@ def from_bigframes_dtype( bigframes_dtype: typing.Union[ bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any] ], - ): + ) -> str: if bigframes_dtype == bigframes.dtypes.INT_DTYPE: return "INT64" elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE: diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index f7327f2a7a..558e4a2aaf 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -1636,11 +1636,11 @@ def remap_vars( def remap_refs( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ResultNode: - output_names = tuple( + output_cols = tuple( (ref.remap_column_refs(mappings), name) for ref, name in self.output_cols ) order_by = self.order_by.remap_column_refs(mappings) if self.order_by else None - return dataclasses.replace(self, output_names=output_names, order_by=order_by) # type: ignore + return dataclasses.replace(self, output_cols=output_cols, order_by=order_by) # type: ignore @property def consumed_ids(self) -> COLUMN_SET: diff --git a/noxfile.py b/noxfile.py index b29cda7a51..3bdff699ed 100644 --- a/noxfile.py +++ b/noxfile.py @@ -76,7 +76,7 @@ ] UNIT_TEST_LOCAL_DEPENDENCIES: 
List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] -UNIT_TEST_EXTRAS: List[str] = [] +UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { "3.12": ["polars", "scikit-learn"], } @@ -203,7 +203,7 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): if install_test_extra and UNIT_TEST_EXTRAS_BY_PYTHON: extras = UNIT_TEST_EXTRAS_BY_PYTHON.get(session.python, []) - elif install_test_extra and UNIT_TEST_EXTRAS: + if install_test_extra and UNIT_TEST_EXTRAS: extras = UNIT_TEST_EXTRAS else: extras = [] diff --git a/setup.py b/setup.py index d0ee4da405..489d9aacd9 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,7 @@ ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. - "tests": [], + "tests": ["freezegun", "pytest-snapshot"], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], "scikit-learn": ["scikit-learn>=1.2.2"], @@ -82,7 +82,6 @@ "pre-commit", "nox", "google-cloud-testutils", - "freezegun", ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index c9087abe88..23456fc76c 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -26,4 +26,15 @@ def compiler_session(): @pytest.fixture(scope="module") def all_types_df() -> pd.DataFrame: # TODO: all types pandas dataframes - return pd.DataFrame({}) + # TODO: add tests for empty dataframes + df = pd.DataFrame( + { + "int1": pd.Series([1, 2, 3], dtype="Int64"), + "int2": pd.Series([-10, 20, 30], dtype="Int64"), + "bools": pd.Series([True, None, False], dtype="boolean"), + "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"), + }, + ) + # add more complexity index. 
+ df.index = df.index.astype("Int64") + return df diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql new file mode 100644 index 0000000000..5c22910422 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM UNNEST(ARRAY>[(1, -10, TRUE, 'b', 0), (2, 20, CAST(NULL AS BOOLEAN), 'aa', 1), (3, 30, FALSE, 'ccc', 2)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index 2a69bb69b4..ad7cc5785a 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -13,12 +13,16 @@ # limitations under the License. import pandas as pd +import pytest import bigframes import bigframes.pandas as bpd +pytest.importorskip("pytest_snapshot") -def test_compile_local(all_types_df: pd.DataFrame, compiler_session: bigframes.Session): + +def test_compile_readlocal( + all_types_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot +): bf_df = bpd.DataFrame(all_types_df, session=compiler_session) - sql = bf_df.sql - assert sql == "SELECT" + snapshot.assert_match(bf_df.sql, "out.sql") From c958dbea32b77cec9fddfc09e3b40d1da220a42c Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 29 Apr 2025 10:27:51 -0700 Subject: [PATCH 21/28] perf: Fold row count ops when known (#1656) --- bigframes/core/array_value.py | 12 +++- bigframes/core/blocks.py | 28 +++------ bigframes/core/compile/compiler.py | 9 +-- bigframes/core/compile/polars/compiler.py | 5 -- bigframes/core/nodes.py | 62 ++++--------------- bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/fold_row_count.py | 44 ++++++++++++++ bigframes/core/rewrite/order.py | 5 -- 
bigframes/core/rewrite/pruning.py | 24 ++++++-- bigframes/session/bq_caching_executor.py | 72 ++++++++--------------- bigframes/session/executor.py | 29 +++++++-- bigframes/session/local_scan_executor.py | 4 +- bigframes/session/read_api_execution.py | 10 +++- tests/system/small/test_session.py | 1 + tests/unit/core/test_blocks.py | 2 - 15 files changed, 160 insertions(+), 149 deletions(-) create mode 100644 bigframes/core/rewrite/fold_row_count.py diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index eba63ad72e..41a6882cd7 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -204,7 +204,17 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" - return ArrayValue(nodes.RowCountNode(child=self.node)) + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=( + ( + ex.NullaryAggregation(agg_ops.size_op), + ids.ColumnId(bigframes.core.guid.generate_guid()), + ), + ), + ) + ) # Operations def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c53f392417..cc3b70f8a8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -39,7 +39,6 @@ Optional, Sequence, Tuple, - TYPE_CHECKING, Union, ) import warnings @@ -68,13 +67,8 @@ import bigframes.core.window_spec as windows import bigframes.dtypes import bigframes.exceptions as bfe -import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops -import bigframes.session._io.pandas as io_pandas - -if TYPE_CHECKING: - import bigframes.session.executor # Type constraint for wherever column labels are used Label = typing.Hashable @@ -221,7 +215,7 @@ def shape(self) -> typing.Tuple[int, int]: except Exception: pass - row_count = self.session._executor.get_row_count(self.expr) + 
row_count = self.session._executor.execute(self.expr.row_count()).to_py_scalar() return (row_count, len(self.value_columns)) @property @@ -485,7 +479,7 @@ def to_arrow( *, ordered: bool = True, allow_large_results: Optional[bool] = None, - ) -> Tuple[pa.Table, bigquery.QueryJob]: + ) -> Tuple[pa.Table, Optional[bigquery.QueryJob]]: """Run query and download results as a pyarrow Table.""" execute_result = self.session._executor.execute( self.expr, ordered=ordered, use_explicit_destination=allow_large_results @@ -580,7 +574,7 @@ def try_peek( result = self.session._executor.peek( self.expr, n, use_explicit_destination=allow_large_results ) - df = io_pandas.arrow_to_pandas(result.to_arrow_table(), self.expr.schema) + df = result.to_pandas() self._copy_index_to_pandas(df) return df else: @@ -604,8 +598,7 @@ def to_pandas_batches( page_size=page_size, max_results=max_results, ) - for record_batch in execute_result.arrow_batches(): - df = io_pandas.arrow_to_pandas(record_batch, self.expr.schema) + for df in execute_result.to_pandas_batches(): self._copy_index_to_pandas(df) if squeeze: yield df.squeeze(axis=1) @@ -659,7 +652,7 @@ def _materialize_local( # TODO: Maybe materialize before downsampling # Some downsampling methods - if fraction < 1: + if fraction < 1 and (execute_result.total_rows is not None): if not sample_config.enable_downsampling: raise RuntimeError( f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of " @@ -690,9 +683,7 @@ def _materialize_local( MaterializationOptions(ordered=materialize_options.ordered) ) else: - total_rows = execute_result.total_rows - arrow = execute_result.to_arrow_table() - df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) + df = execute_result.to_pandas() self._copy_index_to_pandas(df) return df, execute_result.query_job @@ -1570,12 +1561,11 @@ def retrieve_repr_request_results( # head caches full underlying expression, so row_count will be free after head_result = 
self.session._executor.head(self.expr, max_results) - count = self.session._executor.get_row_count(self.expr) + row_count = self.session._executor.execute(self.expr.row_count()).to_py_scalar() - arrow = head_result.to_arrow_table() - df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) + df = head_result.to_pandas() self._copy_index_to_pandas(df) - return df, count, head_result.query_job + return df, row_count, head_result.query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: expr, result_id = self._expr.promote_offsets() diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 21045ca348..fb5399b7cb 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -169,9 +169,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): bq_schema = node.schema.to_bigquery() pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) - pa_table = pa_table.rename_columns( - {item.source_id: item.id.sql for item in node.scan_list.items} - ) + pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items]) if offsets: pa_table = pa_table.append_column( @@ -254,11 +252,6 @@ def compile_concat(node: nodes.ConcatNode, *children: compiled.UnorderedIR): return concat_impl.concat_unordered(children, output_ids) -@_compile_node.register -def compile_rowcount(node: nodes.RowCountNode, child: compiled.UnorderedIR): - return child.row_count(name=node.col_id.sql) - - @_compile_node.register def compile_aggregate(node: nodes.AggregateNode, child: compiled.UnorderedIR): aggs = tuple((agg, id.sql) for agg, id in node.aggregations) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index baa19eb990..b2f018e80a 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -252,11 +252,6 @@ def compile_projection(self, node: nodes.ProjectionNode): ] return 
self.compile_node(node.child).with_columns(new_cols) - @compile_node.register - def compile_rowcount(self, node: nodes.RowCountNode): - df = cast(pl.LazyFrame, self.compile_node(node.child)) - return df.select(pl.len().alias(node.col_id.sql)) - @compile_node.register def compile_offsets(self, node: nodes.PromoteOffsetsNode): return self.compile_node(node.child).with_columns( diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 558e4a2aaf..0fbfe7bd37 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -1256,55 +1256,6 @@ def remap_refs( return dataclasses.replace(self, assignments=new_fields) -# TODO: Merge RowCount into Aggregate Node? -# Row count can be compute from table metadata sometimes, so it is a bit special. -@dataclasses.dataclass(frozen=True, eq=False) -class RowCountNode(UnaryNode): - col_id: identifiers.ColumnId = identifiers.ColumnId("count") - - @property - def row_preserving(self) -> bool: - return False - - @property - def non_local(self) -> bool: - return True - - @property - def fields(self) -> Sequence[Field]: - return (Field(self.col_id, bigframes.dtypes.INT_DTYPE, nullable=False),) - - @property - def variables_introduced(self) -> int: - return 1 - - @property - def defines_namespace(self) -> bool: - return True - - @property - def row_count(self) -> Optional[int]: - return 1 - - @property - def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: - return (self.col_id,) - - @property - def consumed_ids(self) -> COLUMN_SET: - return frozenset() - - def remap_vars( - self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] - ) -> RowCountNode: - return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - - def remap_refs( - self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] - ) -> RowCountNode: - return self - - @dataclasses.dataclass(frozen=True, eq=False) class AggregateNode(UnaryNode): aggregations: typing.Tuple[typing.Tuple[ex.Aggregation, 
identifiers.ColumnId], ...] @@ -1642,6 +1593,19 @@ def remap_refs( order_by = self.order_by.remap_column_refs(mappings) if self.order_by else None return dataclasses.replace(self, output_cols=output_cols, order_by=order_by) # type: ignore + @property + def fields(self) -> Sequence[Field]: + # Fields property here is for output schema, not to be consumed by a parent node. + input_fields_by_id = {field.id: field for field in self.child.fields} + return tuple( + Field( + identifiers.ColumnId(output), + input_fields_by_id[ref.id].dtype, + input_fields_by_id[ref.id].nullable, + ) + for ref, output in self.output_cols + ) + @property def consumed_ids(self) -> COLUMN_SET: out_refs = frozenset(ref.id for ref, _ in self.output_cols) diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index e9a94069f5..069ebb9cdf 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from bigframes.core.rewrite.fold_row_count import fold_row_counts from bigframes.core.rewrite.identifiers import remap_variables from bigframes.core.rewrite.implicit_align import try_row_join from bigframes.core.rewrite.legacy_align import legacy_join_as_projection @@ -38,4 +39,5 @@ "try_reduce_to_table_scan", "bake_order", "try_reduce_to_local_scan", + "fold_row_counts", ] diff --git a/bigframes/core/rewrite/fold_row_count.py b/bigframes/core/rewrite/fold_row_count.py new file mode 100644 index 0000000000..583343d68a --- /dev/null +++ b/bigframes/core/rewrite/fold_row_count.py @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import pyarrow as pa + +from bigframes import dtypes +from bigframes.core import local_data, nodes +from bigframes.operations import aggregations + + +def fold_row_counts(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if not isinstance(node, nodes.AggregateNode): + return node + if len(node.by_column_ids) > 0: + return node + if node.child.row_count is None: + return node + for agg, _ in node.aggregations: + if agg.op != aggregations.size_op: + return node + local_data_source = local_data.ManagedArrowTable.from_pyarrow( + pa.table({"count": pa.array([node.child.row_count], type=pa.int64())}) + ) + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(out_id, dtypes.INT_DTYPE, "count") + for _, out_id in node.aggregations + ) + ) + return nodes.ReadLocalNode( + local_data_source=local_data_source, scan_list=scan_list, session=node.session + ) diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 06deba996c..5b5fb10753 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -211,11 +211,6 @@ def pull_up_order_inner( ) new_order = child_order.remap_column_refs(new_select_node.get_id_mapping()) return new_select_node, new_order - elif isinstance(node, bigframes.core.nodes.RowCountNode): - child_result = remove_order(node.child) - return node.replace_child( - child_result - ), bigframes.core.ordering.TotalOrdering.from_primary_key([node.col_id]) elif isinstance(node, bigframes.core.nodes.AggregateNode): if node.has_ordered_ops: child_result, child_order = 
pull_up_order_inner(node.child) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 61375cf352..1ecfb452ec 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -51,9 +51,17 @@ def prune_columns(node: nodes.BigFrameNode): if isinstance(node, nodes.SelectionNode): result = prune_selection_child(node) elif isinstance(node, nodes.ResultNode): - result = node.replace_child(prune_node(node.child, node.consumed_ids)) + result = node.replace_child( + prune_node( + node.child, node.consumed_ids or frozenset(list(node.child.ids)[0:1]) + ) + ) elif isinstance(node, nodes.AggregateNode): - result = node.replace_child(prune_node(node.child, node.consumed_ids)) + result = node.replace_child( + prune_node( + node.child, node.consumed_ids or frozenset(list(node.child.ids)[0:1]) + ) + ) elif isinstance(node, nodes.InNode): result = dataclasses.replace( node, @@ -71,7 +79,9 @@ def prune_selection_child( # Important to check this first if list(selection.ids) == list(child.ids): - return child + if (ref.ref.id == ref.id for ref in selection.input_output_pairs): + # selection is no-op so just remove it entirely + return child if isinstance(child, nodes.SelectionNode): return selection.remap_refs( @@ -96,6 +106,9 @@ def prune_selection_child( indices = [ list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs ] + if len(indices) == 0: + # pushing zero-column selection into concat messes up emitter for now, which doesn't like zero columns + return selection new_children = [] for concat_node in child.child_nodes: cc_ids = tuple(concat_node.ids) @@ -146,7 +159,10 @@ def prune_aggregate( node: nodes.AggregateNode, used_cols: AbstractSet[identifiers.ColumnId], ) -> nodes.AggregateNode: - pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) + pruned_aggs = ( + tuple(agg for agg in node.aggregations if agg[1] in used_cols) + or node.aggregations[0:1] + ) return 
dataclasses.replace(node, aggregations=pruned_aggs) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 584b41452a..ec5795f9a8 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -27,6 +27,7 @@ import google.cloud.bigquery_storage_v1 import bigframes.core +from bigframes.core import rewrite import bigframes.core.compile import bigframes.core.guid import bigframes.core.nodes as nodes @@ -96,11 +97,7 @@ def to_sql( ) -> str: if offset_column: array_value, _ = array_value.promote_offsets() - node = ( - self.replace_cached_subtrees(array_value.node) - if enable_cache - else array_value.node - ) + node = self.logical_plan(array_value.node) if enable_cache else array_value.node return self.compiler.compile(node, ordered=ordered) def execute( @@ -118,7 +115,7 @@ def execute( if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) - plan = self.replace_cached_subtrees(array_value.node) + plan = self.logical_plan(array_value.node) # Use explicit destination to avoid 10GB limit of temporary table destination_table = ( self.storage_manager.create_temp_table( @@ -227,7 +224,7 @@ def peek( """ A 'peek' efficiently accesses a small number of rows in the dataframe. 
""" - plan = self.replace_cached_subtrees(array_value.node) + plan = self.logical_plan(array_value.node) if not tree_properties.can_fast_peek(plan): msg = bfe.format_message("Peeking this value cannot be done efficiently.") warnings.warn(msg) @@ -249,16 +246,14 @@ def peek( def head( self, array_value: bigframes.core.ArrayValue, n_rows: int ) -> executor.ExecuteResult: - - maybe_row_count = self._local_get_row_count(array_value) - if (maybe_row_count is not None) and (maybe_row_count <= n_rows): - return self.execute(array_value, ordered=True) + plan = self.logical_plan(array_value.node) + if (plan.row_count is not None) and (plan.row_count <= n_rows): + return self._execute_plan(plan, ordered=True) if not self.strictly_ordered and not array_value.node.explicitly_ordered: # No user-provided ordering, so just get any N rows, its faster! return self.peek(array_value, n_rows) - plan = self.replace_cached_subtrees(array_value.node) if not tree_properties.can_fast_head(plan): # If can't get head fast, we are going to need to execute the whole query # Will want to do this in a way such that the result is reusable, but the first @@ -266,26 +261,12 @@ def head( # This currently requires clustering on offsets. 
self._cache_with_offsets(array_value) # Get a new optimized plan after caching - plan = self.replace_cached_subtrees(array_value.node) + plan = self.logical_plan(array_value.node) assert tree_properties.can_fast_head(plan) head_plan = generate_head_plan(plan, n_rows) return self._execute_plan(head_plan, ordered=True) - def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: - # TODO: Fold row count node in and use local execution - count = self._local_get_row_count(array_value) - if count is not None: - return count - else: - row_count_plan = self.replace_cached_subtrees( - generate_row_count_plan(array_value.node) - ) - results = self._execute_plan(row_count_plan, ordered=True) - pa_table = next(results.arrow_batches()) - pa_array = pa_table.column(0) - return pa_array.tolist()[0] - def cached( self, array_value: bigframes.core.ArrayValue, @@ -303,14 +284,6 @@ def cached( else: self._cache_with_cluster_cols(array_value, cluster_cols=cluster_cols) - def _local_get_row_count( - self, array_value: bigframes.core.ArrayValue - ) -> Optional[int]: - # optimized plan has cache materializations which will have row count metadata - # that is more likely to be usable than original leaf nodes. - plan = self.replace_cached_subtrees(array_value.node) - return tree_properties.row_count(plan) - # Helpers def _run_execute_query( self, @@ -365,16 +338,25 @@ def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): # Once rewriting is available, will want to rewrite before # evaluating execution cost. return tree_properties.is_trivially_executable( - self.replace_cached_subtrees(array_value.node) + self.logical_plan(array_value.node) ) + def logical_plan(self, root: nodes.BigFrameNode) -> nodes.BigFrameNode: + """ + Apply universal logical simplifications that are helpful regardless of engine. 
+ """ + plan = self.replace_cached_subtrees(root) + plan = rewrite.column_pruning(plan) + plan = plan.top_down(rewrite.fold_row_counts) + return plan + def _cache_with_cluster_cols( self, array_value: bigframes.core.ArrayValue, cluster_cols: Sequence[str] ): """Executes the query and uses the resulting table to rewrite future executions.""" sql, schema, ordering_info = self.compiler.compile_raw( - self.replace_cached_subtrees(array_value.node) + self.logical_plan(array_value.node) ) tmp_table = self._sql_as_cached_temp_table( sql, @@ -391,9 +373,7 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") w_offsets, offset_column = array_value.promote_offsets() - sql = self.compiler.compile( - self.replace_cached_subtrees(w_offsets.node), ordered=False - ) + sql = self.compiler.compile(self.logical_plan(w_offsets.node), ordered=False) tmp_table = self._sql_as_cached_temp_table( sql, @@ -429,8 +409,10 @@ def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue): """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" # Apply existing caching first for _ in range(MAX_SUBTREE_FACTORINGS): - node_with_cache = self.replace_cached_subtrees(array_value.node) - if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT: + if ( + self.logical_plan(array_value.node).planning_complexity + < QUERY_COMPLEXITY_LIMIT + ): return did_cache = self._cache_most_complex_subtree(array_value.node) @@ -486,7 +468,7 @@ def _validate_result_schema( ): actual_schema = _sanitize(tuple(bq_schema)) ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema( - self.replace_cached_subtrees(array_value.node) + self.logical_plan(array_value.node) ).to_bigquery() internal_schema = _sanitize(array_value.schema.to_bigquery()) if not 
bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: @@ -593,7 +575,3 @@ def _sanitize( def generate_head_plan(node: nodes.BigFrameNode, n: int): return nodes.SliceNode(node, start=None, stop=n) - - -def generate_row_count_plan(node: nodes.BigFrameNode): - return nodes.RowCountNode(node) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 496ff560dd..0ba4ee3c2d 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -16,14 +16,17 @@ import abc import dataclasses +import functools import itertools from typing import Callable, Iterator, Literal, Mapping, Optional, Sequence, Union from google.cloud import bigquery +import pandas as pd import pyarrow import bigframes.core import bigframes.core.schema +import bigframes.session._io.pandas as io_pandas @dataclasses.dataclass(frozen=True) @@ -49,6 +52,26 @@ def to_arrow_table(self) -> pyarrow.Table: else: return self.schema.to_pyarrow().empty_table() + def to_pandas(self) -> pd.DataFrame: + return io_pandas.arrow_to_pandas(self.to_arrow_table(), self.schema) + + def to_pandas_batches(self) -> Iterator[pd.DataFrame]: + yield from map( + functools.partial(io_pandas.arrow_to_pandas, schema=self.schema), + self.arrow_batches(), + ) + + def to_py_scalar(self): + columns = list(self.to_arrow_table().to_pydict().values()) + if len(columns) != 1: + raise ValueError( + f"Expected single column result, got {len(columns)} columns." + ) + column = columns[0] + if len(column) != 1: + raise ValueError(f"Expected single row result, got {len(column)} rows.") + return column[0] + class Executor(abc.ABC): """ @@ -75,7 +98,7 @@ def execute( use_explicit_destination: Optional[bool] = False, page_size: Optional[int] = None, max_results: Optional[int] = None, - ): + ) -> ExecuteResult: """ Execute the ArrayValue, storing the result to a temporary session-owned table. 
""" @@ -135,10 +158,6 @@ def head( """ raise NotImplementedError("head not implemented for this executor") - # TODO: This should be done through execute() - def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: - raise NotImplementedError("get_row_count not implemented for this executor") - def cached( self, array_value: bigframes.core.ArrayValue, diff --git a/bigframes/session/local_scan_executor.py b/bigframes/session/local_scan_executor.py index 3ed1b759c5..67e381ab8a 100644 --- a/bigframes/session/local_scan_executor.py +++ b/bigframes/session/local_scan_executor.py @@ -50,9 +50,7 @@ def iterator_supplier(): needed_cols.append(offsets_col) arrow_table = arrow_table.select(needed_cols) - arrow_table = arrow_table.rename_columns( - {item.source_id: item.id.sql for item in node.scan_list.items} - ) + arrow_table = arrow_table.rename_columns([id.sql for id in node.ids]) yield from arrow_table.to_batches() total_rows = node.row_count diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 32095e41f4..ae1272e722 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -16,6 +16,7 @@ from typing import Any, Optional from google.cloud import bigquery_storage_v1 +import pyarrow as pa from bigframes.core import bigframe_node, rewrite from bigframes.session import executor, semi_executor @@ -89,7 +90,14 @@ def iterator_supplier(): session.streams[0].name, retry=None ) rowstream = reader.rows() - return map(lambda page: page.to_arrow(), rowstream.pages) + + def process_page(page): + pa_batch = page.to_arrow() + return pa.RecordBatch.from_arrays( + pa_batch.columns, names=[id.sql for id in node.ids] + ) + + return map(process_page, rowstream.pages) return executor.ExecuteResult( arrow_batches=iterator_supplier, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 1b886fbc5a..c7bf5b3f5e 100644 --- 
a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -871,6 +871,7 @@ def test_read_pandas_tokyo( result = session_tokyo._executor.execute( df._block.expr, use_explicit_destination=True ) + assert result.query_job is not None assert result.query_job.location == tokyo_location assert len(expected) == result.total_rows diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index b1b276bda3..7c06bedfd3 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -85,13 +85,11 @@ def test_block_from_local(data): # hard-coded the returned dimension of the session for that each of the test case contains 3 rows. mock_session._executor = mock_executor - mock_executor.get_row_count.return_value = 3 block = blocks.Block.from_local(pandas.DataFrame(data), mock_session) pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index.names) == tuple(expected.index.names) - assert block.shape == expected.shape def test_block_compute_dry_run__raises_error_when_sampling_is_enabled(): From 4fb54dfe448604a90fc1818cf18b1e77e1e7227b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 29 Apr 2025 10:37:08 -0700 Subject: [PATCH 22/28] feat: add gemini-2.0-flash-001 and gemini-2.0-flash-lite-001 to fine tune score endponts and multimodal endpoints (#1650) * add test, no code support yet * add gemini-2.0-xx to fine tune score endponts and multimodal endpoints * wait for bqml to support gemini-2.0-flash-lite-001 * remove unsupported GA feature * remove unsupported fine-tune endpoints * fix a failed test * remove features are not ready * fix failed test * revert a typo --- bigframes/ml/llm.py | 8 ++++++-- tests/system/small/ml/test_llm.py | 10 ++++++++++ tests/system/small/ml/test_multimodal_llm.py | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 49dbfd3443..3aecc34142 100644 --- a/bigframes/ml/llm.py +++ 
b/bigframes/ml/llm.py @@ -72,6 +72,8 @@ _GEMINI_FINE_TUNE_SCORE_ENDPOINTS = ( _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, + _GEMINI_2_FLASH_001_ENDPOINT, + _GEMINI_2_FLASH_LITE_001_ENDPOINT, ) _GEMINI_MULTIMODAL_ENDPOINTS = ( _GEMINI_1P5_PRO_001_ENDPOINT, @@ -79,6 +81,7 @@ _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, _GEMINI_2_FLASH_EXP_ENDPOINT, + _GEMINI_2_FLASH_001_ENDPOINT, ) _CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet" @@ -712,7 +715,8 @@ def score( ] = "text_generation", ) -> bigframes.dataframe.DataFrame: """Calculate evaluation metrics of the model. Only support - "gemini-1.5-pro-002", and "gemini-1.5-flash-002". + "gemini-1.5-pro-002", "gemini-1.5-flash-002", + "gemini-2.0-flash-lite-001", and "gemini-2.0-flash-001". .. note:: @@ -746,7 +750,7 @@ def score( if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS: raise NotImplementedError( - "score() only supports gemini-1.5-pro-002, and gemini-1.5-flash-2 model." + "score() only supports gemini-1.5-pro-002, gemini-1.5-flash-2, gemini-2.0-flash-001, and gemini-2.0-flash-lite-001 model." 
) X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 90483667d8..51e9d8ad6a 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -152,6 +152,8 @@ def test_create_load_gemini_text_generator_model( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky(retries=2) @@ -177,6 +179,8 @@ def test_gemini_text_generator_predict_default_params_success( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky(retries=2) @@ -204,6 +208,8 @@ def test_gemini_text_generator_predict_with_params_success( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky(retries=2) @@ -764,6 +770,8 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): ( "gemini-1.5-pro-002", "gemini-1.5-flash-002", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): @@ -792,6 +800,8 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): ( "gemini-1.5-pro-002", "gemini-1.5-flash-002", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name): diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index 51e6bcb2d5..7c07d9ead2 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -47,6 +47,7 @@ def test_multimodal_embedding_generator_predict_default_params_success( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", 
), ) @pytest.mark.flaky(retries=2) From 563f0cbdf4a18c3cd1bd2a4b52de823165638911 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 29 Apr 2025 14:26:26 -0700 Subject: [PATCH 23/28] feat: DataFrames.__getitem__ support for slice input (#1668) --- bigframes/dataframe.py | 3 +++ tests/system/small/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 101e7a1c5e..9cb388329e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -576,6 +576,9 @@ def __getitem__( if isinstance(key, bigframes.series.Series): return self._getitem_bool_series(key) + if isinstance(key, slice): + return self.iloc[key] + if isinstance(key, typing.Hashable): return self._getitem_label(key) # Select a subset of columns or re-order columns. diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 452fa841e4..362d736aeb 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -221,6 +221,21 @@ def test_get_column_nonstring(scalars_dfs): assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + "row_slice", + [ + (slice(1, 7, 2)), + (slice(1, 7, None)), + (slice(None, -3, None)), + ], +) +def test_get_rows_with_slice(scalars_dfs, row_slice): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[row_slice].to_pandas() + pd_result = scalars_pandas_df[row_slice] + assert_pandas_df_equal(bf_result, pd_result) + + def test_hasattr(scalars_dfs): scalars_df, _ = scalars_dfs assert hasattr(scalars_df, "int64_col") From ae312dbed25da6da5e2817d5c9838654c2a1ad1c Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 29 Apr 2025 15:15:11 -0700 Subject: [PATCH 24/28] =?UTF-8?q?feat:=20session.bytes=5Fprocessed=5Fsum?= =?UTF-8?q?=20will=20be=20updated=20when=20allow=5Flarge=5Fre=E2=80=A6=20(?= =?UTF-8?q?#1669)?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: session.bytes_processed_sum will be updated when allow_large_result=False * mypy fix * remove duplicate logic --- bigframes/session/__init__.py | 7 --- bigframes/session/_io/bigquery/__init__.py | 5 +-- bigframes/session/metrics.py | 51 +++++++++++----------- scripts/run_and_publish_benchmark.py | 43 ++++++++---------- 4 files changed, 45 insertions(+), 61 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6379a6f2e8..c38d124196 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -336,13 +336,6 @@ def _project(self): @property def bytes_processed_sum(self): """The sum of all bytes processed by bigquery jobs using this session.""" - msg = bfe.format_message( - "Queries executed with `allow_large_results=False` within the session will not " - "have their bytes processed counted in this sum. If you need precise " - "bytes processed information, query the `INFORMATION_SCHEMA` tables " - "to get relevant metrics.", - ) - warnings.warn(msg, UserWarning) return self._metrics.bytes_processed @property diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 4fdd836777..6df9424e3b 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -27,7 +27,6 @@ import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.cloud.bigquery as bigquery -import google.cloud.bigquery.table from bigframes.core import log_adapter import bigframes.core.compile.googlesql as googlesql @@ -249,7 +248,7 @@ def start_query_with_client( max_results=max_results, ) if metrics is not None: - metrics.count_job_stats(query=sql) + metrics.count_job_stats(row_iterator=results_iterator) return results_iterator, None query_job = bq_client.query( @@ -278,7 +277,7 @@ def 
start_query_with_client( ) if metrics is not None: - metrics.count_job_stats(query_job) + metrics.count_job_stats(query_job=query_job) return results_iterator, query_job diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index b4e1458b21..055abd5899 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -20,6 +20,7 @@ import google.cloud.bigquery as bigquery import google.cloud.bigquery.job as bq_job +import google.cloud.bigquery.table as bq_table LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -33,14 +34,22 @@ class ExecutionMetrics: query_char_count: int = 0 def count_job_stats( - self, query_job: Optional[bq_job.QueryJob] = None, query: str = "" + self, + query_job: Optional[bq_job.QueryJob] = None, + row_iterator: Optional[bq_table.RowIterator] = None, ): if query_job is None: - query_char_count = len(query) + assert row_iterator is not None + if (row_iterator.total_bytes_processed is None) or ( + row_iterator.query is None + ): + return + query_char_count = len(row_iterator.query) + bytes_processed = row_iterator.total_bytes_processed self.execution_count += 1 self.query_char_count += query_char_count - if LOGGING_NAME_ENV_VAR in os.environ: - write_stats_to_disk(query_char_count) + self.bytes_processed += bytes_processed + write_stats_to_disk(query_char_count, bytes_processed) return stats = get_performance_stats(query_job) @@ -51,11 +60,9 @@ def count_job_stats( self.bytes_processed += bytes_processed self.slot_millis += slot_millis self.execution_secs += execution_secs - if LOGGING_NAME_ENV_VAR in os.environ: - # when running notebooks via pytest nbmake - write_stats_to_disk( - query_char_count, bytes_processed, slot_millis, execution_secs - ) + write_stats_to_disk( + query_char_count, bytes_processed, slot_millis, execution_secs + ) def get_performance_stats( @@ -88,7 +95,7 @@ def get_performance_stats( def write_stats_to_disk( query_char_count: int, - bytes_processed: Optional[int] = None, + 
bytes_processed: int, slot_millis: Optional[int] = None, exec_seconds: Optional[float] = None, ): @@ -96,24 +103,13 @@ def write_stats_to_disk( to a file in order to create a performance report. """ if LOGGING_NAME_ENV_VAR not in os.environ: - raise EnvironmentError( - "Environment variable {env_var} is not set".format( - env_var=LOGGING_NAME_ENV_VAR - ) - ) + return + + # when running notebooks via pytest nbmake and running benchmarks test_name = os.environ[LOGGING_NAME_ENV_VAR] current_directory = os.getcwd() - if ( - (bytes_processed is not None) - and (slot_millis is not None) - and (exec_seconds is not None) - ): - # store bytes processed - bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") - with open(bytes_file, "a") as f: - f.write(str(bytes_processed) + "\n") - + if (slot_millis is not None) and (exec_seconds is not None): # store slot milliseconds slot_file = os.path.join(current_directory, test_name + ".slotmillis") with open(slot_file, "a") as f: @@ -132,3 +128,8 @@ def write_stats_to_disk( ) with open(query_char_count_file, "a") as f: f.write(str(query_char_count) + "\n") + + # store bytes processed + bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") + with open(bytes_file, "a") as f: + f.write(str(bytes_processed) + "\n") diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py index 402ba4d213..0ea3a5e162 100644 --- a/scripts/run_and_publish_benchmark.py +++ b/scripts/run_and_publish_benchmark.py @@ -93,10 +93,10 @@ def collect_benchmark_result( error_files = sorted(path.rglob("*.error")) if not ( - len(bytes_files) - == len(millis_files) + len(millis_files) == len(bq_seconds_files) - <= len(query_char_count_files) + <= len(bytes_files) + == len(query_char_count_files) == len(local_seconds_files) ): raise ValueError( @@ -108,10 +108,13 @@ def collect_benchmark_result( for idx in range(len(local_seconds_files)): query_char_count_file = query_char_count_files[idx] 
local_seconds_file = local_seconds_files[idx] + bytes_file = bytes_files[idx] filename = query_char_count_file.relative_to(path).with_suffix("") - if filename != local_seconds_file.relative_to(path).with_suffix(""): + if filename != local_seconds_file.relative_to(path).with_suffix( + "" + ) or filename != bytes_file.relative_to(path).with_suffix(""): raise ValueError( - "File name mismatch between query_char_count and seconds reports." + "File name mismatch among query_char_count, bytes and seconds reports." ) with open(query_char_count_file, "r") as file: @@ -123,27 +126,23 @@ def collect_benchmark_result( lines = file.read().splitlines() local_seconds = sum(float(line) for line in lines) / iterations + with open(bytes_file, "r") as file: + lines = file.read().splitlines() + total_bytes = sum(int(line) for line in lines) / iterations + if not has_full_metrics: - total_bytes = None total_slot_millis = None bq_seconds = None else: - bytes_file = bytes_files[idx] millis_file = millis_files[idx] bq_seconds_file = bq_seconds_files[idx] - if ( - filename != bytes_file.relative_to(path).with_suffix("") - or filename != millis_file.relative_to(path).with_suffix("") - or filename != bq_seconds_file.relative_to(path).with_suffix("") - ): + if filename != millis_file.relative_to(path).with_suffix( + "" + ) or filename != bq_seconds_file.relative_to(path).with_suffix(""): raise ValueError( "File name mismatch among query_char_count, bytes, millis, and seconds reports." 
) - with open(bytes_file, "r") as file: - lines = file.read().splitlines() - total_bytes = sum(int(line) for line in lines) / iterations - with open(millis_file, "r") as file: lines = file.read().splitlines() total_slot_millis = sum(int(line) for line in lines) / iterations @@ -202,11 +201,7 @@ def collect_benchmark_result( print( f"{index} - query count: {row['Query_Count']}," + f" query char count: {row['Query_Char_Count']}," - + ( - f" bytes processed sum: {row['Bytes_Processed']}," - if has_full_metrics - else "" - ) + + f" bytes processed sum: {row['Bytes_Processed']}," + (f" slot millis sum: {row['Slot_Millis']}," if has_full_metrics else "") + f" local execution time: {formatted_local_exec_time} seconds" + ( @@ -238,11 +233,7 @@ def collect_benchmark_result( print( f"---Geometric mean of queries: {geometric_mean_queries}," + f" Geometric mean of queries char counts: {geometric_mean_query_char_count}," - + ( - f" Geometric mean of bytes processed: {geometric_mean_bytes}," - if has_full_metrics - else "" - ) + + f" Geometric mean of bytes processed: {geometric_mean_bytes}," + ( f" Geometric mean of slot millis: {geometric_mean_slot_millis}," if has_full_metrics From 33881914ab5b8d0e701eabd9c731aed1deab3d49 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 29 Apr 2025 18:11:41 -0700 Subject: [PATCH 25/28] feat: support names parameter in read_csv for bigquery engine (#1659) --- bigframes/core/utils.py | 6 +- bigframes/session/__init__.py | 36 +++--- .../session/_io/bigquery/read_gbq_table.py | 14 +++ bigframes/session/loader.py | 64 +++++++---- tests/system/small/test_session.py | 108 +++++++++++++++++- tests/unit/session/test_session.py | 24 +++- .../pandas/io/parsers/readers.py | 2 +- 7 files changed, 205 insertions(+), 49 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 9731857ea0..495523d2fc 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -41,8 +41,10 @@ def get_axis_number(axis: typing.Union[str, 
int]) -> typing.Literal[0, 1]: raise ValueError(f"Not a valid axis: {axis}") -def is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: - return pd.api.types.is_list_like(obj) +def is_list_like( + obj: typing.Any, allow_sets: bool = True +) -> typing_extensions.TypeGuard[typing.Sequence]: + return pd.api.types.is_list_like(obj, allow_sets=allow_sets) def is_dict_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Mapping]: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c38d124196..6801937fbe 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -16,6 +16,7 @@ from __future__ import annotations +from collections import abc import datetime import logging import os @@ -569,7 +570,7 @@ def read_gbq_table( columns = col_order return self._loader.read_gbq_table( - query=query, + table_id=query, index_col=index_col, columns=columns, max_results=max_results, @@ -953,14 +954,21 @@ def _read_csv_w_bigquery_engine( native CSV loading capabilities, making it suitable for large datasets that may not fit into local memory. """ - - if any(param is not None for param in (dtype, names)): - not_supported = ("dtype", "names") + if dtype is not None: raise NotImplementedError( - f"BigQuery engine does not support these arguments: {not_supported}. " + f"BigQuery engine does not support the `dtype` argument." 
f"{constants.FEEDBACK_LINK}" ) + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicated names are not allowed.") + if not ( + bigframes.core.utils.is_list_like(names, allow_sets=False) + or isinstance(names, abc.KeysView) + ): + raise ValueError("Names should be an ordered collection.") + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") @@ -1004,11 +1012,9 @@ def _read_csv_w_bigquery_engine( elif header > 0: job_config.skip_leading_rows = header + 1 - return self._loader.read_bigquery_load_job( - filepath_or_buffer, - job_config=job_config, - index_col=index_col, - columns=columns, + table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config) + return self._loader.read_gbq_table( + table_id, index_col=index_col, columns=columns, names=names ) def read_pickle( @@ -1049,8 +1055,8 @@ def read_parquet( job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.PARQUET job_config.labels = {"bigframes-api": "read_parquet"} - - return self._loader.read_bigquery_load_job(path, job_config=job_config) + table_id = self._loader.load_file(path, job_config=job_config) + return self._loader.read_gbq_table(table_id) else: if "*" in path: raise ValueError( @@ -1121,10 +1127,8 @@ def read_json( job_config.encoding = encoding job_config.labels = {"bigframes-api": "read_json"} - return self._loader.read_bigquery_load_job( - path_or_buf, - job_config=job_config, - ) + table_id = self._loader.load_file(path_or_buf, job_config=job_config) + return self._loader.read_gbq_table(table_id) else: if any(arg in kwargs for arg in ("chunksize", "iterator")): raise NotImplementedError( diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 34183b22bc..8d8f247185 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -235,6 +235,8 @@ def get_index_cols( | 
Iterable[int] | int | bigframes.enums.DefaultIndexKind, + *, + names: Optional[Iterable[str]] = None, ) -> List[str]: """ If we can get a total ordering from the table, such as via primary key @@ -245,6 +247,14 @@ def get_index_cols( # Transform index_col -> index_cols so we have a variable that is # always a list of column names (possibly empty). schema_len = len(table.schema) + + # If the `names` is provided, the index_col provided by the user is the new + # name, so we need to rename it to the original name in the table schema. + renamed_schema: Optional[Dict[str, str]] = None + if names is not None: + assert len(list(names)) == schema_len + renamed_schema = {name: field.name for name, field in zip(names, table.schema)} + index_cols: List[str] = [] if isinstance(index_col, bigframes.enums.DefaultIndexKind): if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: @@ -261,6 +271,8 @@ def get_index_cols( f"Got unexpected index_col {repr(index_col)}. {constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): + if renamed_schema is not None: + index_col = renamed_schema.get(index_col, index_col) index_cols = [index_col] elif isinstance(index_col, int): if not 0 <= index_col < schema_len: @@ -272,6 +284,8 @@ def get_index_cols( elif isinstance(index_col, Iterable): for item in index_col: if isinstance(item, str): + if renamed_schema is not None: + item = renamed_schema.get(item, item) index_cols.append(item) elif isinstance(item, int): if not 0 <= item < schema_len: diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 76f12ae438..e6b24e016c 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -348,7 +348,7 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): def read_gbq_table( self, - query: str, + table_id: str, *, index_col: Iterable[str] | str @@ -356,6 +356,7 @@ def read_gbq_table( | int | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), + names: 
Optional[Iterable[str]] = None, max_results: Optional[int] = None, api_name: str = "read_gbq_table", use_cache: bool = True, @@ -375,7 +376,7 @@ def read_gbq_table( ) table_ref = google.cloud.bigquery.table.TableReference.from_string( - query, default_project=self._bqclient.project + table_id, default_project=self._bqclient.project ) columns = list(columns) @@ -411,12 +412,37 @@ def read_gbq_table( f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?" ) + # TODO(b/408499371): check `names` work with `use_cols` for read_csv method. + if names is not None: + len_names = len(list(names)) + len_columns = len(table.schema) + if len_names > len_columns: + raise ValueError( + f"Too many columns specified: expected {len_columns}" + f" and found {len_names}" + ) + elif len_names < len_columns: + if ( + isinstance(index_col, bigframes.enums.DefaultIndexKind) + or index_col != () + ): + raise KeyError( + "When providing both `index_col` and `names`, ensure the " + "number of `names` matches the number of columns in your " + "data." + ) + index_col = range(len_columns - len_names) + names = [ + field.name for field in table.schema[: len_columns - len_names] + ] + list(names) + # Converting index_col into a list of column names requires # the table metadata because we might use the primary keys # when constructing the index. index_cols = bf_read_gbq_table.get_index_cols( table=table, index_col=index_col, + names=names, ) _check_column_duplicates(index_cols, columns) @@ -443,7 +469,7 @@ def read_gbq_table( # TODO(b/338419730): We don't need to fallback to a query for wildcard # tables if we allow some non-determinism when time travel isn't supported. if max_results is not None or bf_io_bigquery.is_table_with_wildcard_suffix( - query + table_id ): # TODO(b/338111344): If we are running a query anyway, we might as # well generate ROW_NUMBER() at the same time. 
@@ -451,7 +477,7 @@ def read_gbq_table( itertools.chain(index_cols, columns) if columns else () ) query = bf_io_bigquery.to_query( - query, + table_id, columns=all_columns, sql_predicate=bf_io_bigquery.compile_filters(filters) if filters @@ -561,6 +587,15 @@ def read_gbq_table( index_names = [None] value_columns = [col for col in array_value.column_ids if col not in index_cols] + if names is not None: + renamed_cols: Dict[str, str] = { + col: new_name for col, new_name in zip(array_value.column_ids, names) + } + index_names = [ + renamed_cols.get(index_col, index_col) for index_col in index_cols + ] + value_columns = [renamed_cols.get(col, col) for col in value_columns] + block = blocks.Block( array_value, index_columns=index_cols, @@ -576,18 +611,12 @@ def read_gbq_table( df.sort_index() return df - def read_bigquery_load_job( + def load_file( self, filepath_or_buffer: str | IO["bytes"], *, job_config: bigquery.LoadJobConfig, - index_col: Iterable[str] - | str - | Iterable[int] - | int - | bigframes.enums.DefaultIndexKind = (), - columns: Iterable[str] = (), - ) -> dataframe.DataFrame: + ) -> str: # Need to create session table beforehand table = self._storage_manager.create_temp_table(_PLACEHOLDER_SCHEMA) # but, we just overwrite the placeholder schema immediately with the load job @@ -615,16 +644,7 @@ def read_bigquery_load_job( self._start_generic_job(load_job) table_id = f"{table.project}.{table.dataset_id}.{table.table_id}" - - # The BigQuery REST API for tables.get doesn't take a session ID, so we - # can't get the schema for a temp table that way. 
- - return self.read_gbq_table( - query=table_id, - index_col=index_col, - columns=columns, - api_name="read_gbq_table", - ) + return table_id def read_gbq_query( self, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index c7bf5b3f5e..ced01c940f 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -59,7 +59,7 @@ def df_and_local_csv(scalars_df_index): with tempfile.TemporaryDirectory() as dir: # Prepares local CSV file for reading - path = dir + "/write_df_to_local_csv_file.csv" + path = dir + "/test_read_csv_w_local_csv.csv" scalars_df_index.to_csv(path, index=True) yield scalars_df_index, path @@ -71,7 +71,19 @@ def df_and_gcs_csv(scalars_df_index, gcs_folder): drop_columns = ["bytes_col", "datetime_col", "numeric_col", "geography_col"] scalars_df_index = scalars_df_index.drop(columns=drop_columns) - path = gcs_folder + "test_read_csv_w_write_engine*.csv" + path = gcs_folder + "test_read_csv_w_gcs_csv*.csv" + read_path = utils.get_first_file_from_wildcard(path) + scalars_df_index.to_csv(path, index=True) + return scalars_df_index, read_path + + +@pytest.fixture(scope="module") +def df_and_gcs_csv_for_two_columns(scalars_df_index, gcs_folder): + # Some tests require only two columns to be present in the CSV file. 
+ selected_cols = ["bool_col", "int64_col"] + scalars_df_index = scalars_df_index[selected_cols] + + path = gcs_folder + "df_and_gcs_csv_for_two_columns*.csv" read_path = utils.get_first_file_from_wildcard(path) scalars_df_index.to_csv(path, index=True) return scalars_df_index, read_path @@ -1260,6 +1272,98 @@ def test_read_csv_raises_error_for_invalid_index_col( session.read_csv(path, engine="bigquery", index_col=index_col) +def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv(path, names=names, dtype=bf_df.dtypes.to_dict()) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index(names[0]).sort_index() + pd_df = pd_df.set_index(names[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_more_than_columns_can_raise_error( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b", "c", "d"] + with pytest.raises( + ValueError, + match="Too many columns specified: expected 3 and found 4", + ): + session.read_csv(path, engine="bigquery", names=names) + + +def test_read_csv_for_names_less_than_columns(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + names = ["b", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. 
+ pd_df = session.read_csv(path, names=names, dtype=bf_df.dtypes.to_dict()) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.sort_index() + + # Pandas's index name is None, while BigFrames's index name is "rowindex". + pd_df.index.name = "rowindex" + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_less_than_columns_raise_error_when_index_col_set( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["b", "c"] + with pytest.raises( + KeyError, + match="ensure the number of `names` matches the number of columns in your data.", + ): + session.read_csv(path, engine="bigquery", names=names, index_col="rowindex") + + +@pytest.mark.parametrize( + "index_col", + [ + pytest.param("a", id="single_str"), + pytest.param(["a", "b"], id="multi_str"), + pytest.param(0, id="single_int"), + ], +) +def test_read_csv_for_names_and_index_col( + session, df_and_gcs_csv_for_two_columns, index_col +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b", "c"] + bf_df = session.read_csv(path, engine="bigquery", index_col=index_col, names=names) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. 
+ pd_df = session.read_csv( + path, index_col=index_col, names=names, dtype=bf_df.dtypes.to_dict() + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + pd.testing.assert_frame_equal( + bf_df.to_pandas(), pd_df.to_pandas(), check_index_type=False + ) + + @pytest.mark.parametrize( ("kwargs", "match"), [ diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 22b439a38b..91b6679702 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -108,14 +108,9 @@ @pytest.mark.parametrize( ("kwargs", "match"), [ - pytest.param( - {"engine": "bigquery", "names": []}, - "BigQuery engine does not support these arguments", - id="with_names", - ), pytest.param( {"engine": "bigquery", "dtype": {}}, - "BigQuery engine does not support these arguments", + "BigQuery engine does not support the `dtype` argument", id="with_dtype", ), pytest.param( @@ -203,6 +198,23 @@ def test_read_csv_with_incompatible_write_engine(engine, write_engine): ) +@pytest.mark.parametrize( + ("names", "error_message"), + ( + pytest.param("abc", "Names should be an ordered collection."), + pytest.param({"a", "b", "c"}, "Names should be an ordered collection."), + pytest.param(["a", "a"], "Duplicated names are not allowed."), + ), +) +def test_read_csv_w_bigquery_engine_raises_error_for_invalid_names( + names, error_message +): + session = mocks.create_bigquery_session() + + with pytest.raises(ValueError, match=error_message): + session.read_csv("path/to/csv.csv", engine="bigquery", names=names) + + @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) def test_read_gbq_missing_parts(missing_parts_table_id): session = mocks.create_bigquery_session() diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 2b1e3dd70b..4757f5ed9d 100644 --- 
a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -114,7 +114,7 @@ def read_csv( names (default None): a list of column names to use. If the file contains a header row and you want to pass this parameter, then `header=0` should be passed as well so the - first (header) row is ignored. Only to be used with default engine. + first (header) row is ignored. index_col (default None): column(s) to use as the row labels of the DataFrame, either given as string name or column index. `index_col=False` can be used with the default From 9ac81351deac9bec8174d3393abdcdef29e30c2a Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 30 Apr 2025 11:40:55 -0700 Subject: [PATCH 26/28] refactor: implements `compile_selection` method (#1672) --- bigframes/core/compile/sqlglot/compiler.py | 15 ++++++--- .../core/compile/sqlglot/scalar_compiler.py | 33 +++++++++++++++++++ bigframes/core/compile/sqlglot/sqlglot_ir.py | 16 ++++++++- .../test_compile_readlocal/out.sql | 6 +++- 4 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 bigframes/core/compile/sqlglot/scalar_compiler.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 2b78a6e458..5826ab2871 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -20,9 +20,11 @@ from google.cloud import bigquery import pyarrow as pa +import sqlglot.expressions as sge from bigframes.core import expression, identifiers, nodes, rewrite from bigframes.core.compile import configs +import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.core.ordering as bf_ordering @@ -38,7 +40,7 @@ def compile( ordered: bool = True, limit: typing.Optional[int] = None, ) -> str: - """Compile node into sql where rows are sorted with ORDER BY.""" + """Compiles node into sql where 
rows are sorted with ORDER BY.""" request = configs.CompileRequest(node, sort_rows=ordered, peek_count=limit) return self._compile_sql(request).sql @@ -48,7 +50,7 @@ def compile_raw( ) -> typing.Tuple[ str, typing.Sequence[bigquery.SchemaField], bf_ordering.RowOrdering ]: - """Compile node into sql that exposes all columns, including hidden + """Compiles node into sql that exposes all columns, including hidden ordering-only columns.""" request = configs.CompileRequest( node, sort_rows=False, materialize_all_order_keys=True @@ -163,6 +165,9 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: @_compile_node.register -def compile_selection(node: nodes.SelectionNode, child: ir.SQLGlotIR): - # TODO: add support for selection - return child +def compile_selection(node: nodes.SelectionNode, child: ir.SQLGlotIR) -> ir.SQLGlotIR: + select_cols: typing.Dict[str, sge.Expression] = { + id.name: scalar_compiler.compile_scalar_expression(expr) + for expr, id in node.input_output_pairs + } + return child.select(select_cols) diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py new file mode 100644 index 0000000000..f9197ef745 --- /dev/null +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import functools + +import sqlglot.expressions as sge + +from bigframes.core import expression + + +@functools.singledispatch +def compile_scalar_expression( + expression: expression.Expression, +) -> sge.Expression: + """Compiles BigFrames scalar expression into SQLGlot expression.""" + raise ValueError(f"Can't compile unrecognized node: {expression}") + + +@compile_scalar_expression.register +def compile_deref_op(expr: expression.DerefOp): + return sge.ColumnDef(this=sge.to_identifier(expr.id.sql, quoted=True)) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 836e7f17ba..fe85f22494 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -30,7 +30,7 @@ class SQLGlotIR: """Helper class to build SQLGlot Query and generate SQL string.""" - expr: sge.Expression = sge.Expression() + expr: sge.Select = sg.select() """The SQLGlot expression representing the query.""" dialect = sqlglot.dialects.bigquery.BigQuery @@ -90,6 +90,20 @@ def from_pandas( ) return cls(expr=sg.select(sge.Star()).from_(expr)) + def select( + self, + select_cols: typing.Dict[str, sge.Expression], + ) -> SQLGlotIR: + selected_cols = [ + sge.Alias( + this=expr, + alias=sge.to_identifier(id, quoted=self.quoted), + ) + for id, expr in select_cols.items() + ] + expr = self.expr.select(*selected_cols, append=False) + return SQLGlotIR(expr=expr) + def _literal(value: typing.Any, dtype: str) -> sge.Expression: if value is None: diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index 5c22910422..c2a443dab2 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -1,3 +1,7 @@ SELECT - * + `bfcol_0` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_2` AS `bfcol_7`, + `bfcol_3` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` FROM UNNEST(ARRAY>[(1, -10, TRUE, 'b', 0), (2, 20, CAST(NULL AS BOOLEAN), 'aa', 1), (3, 30, FALSE, 'ccc', 2)]) \ No newline at end of file From 36a3314aba36d6699d12ebfe905dbaac4ee1f4cd Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 30 Apr 2025 13:25:41 -0700 Subject: [PATCH 27/28] refactor: support all data types for readlocal compiler (#1666) --- bigframes/core/compile/sqlglot/compiler.py | 11 +- bigframes/core/compile/sqlglot/sqlglot_ir.py | 68 +++++-- tests/data/scalars.jsonl | 18 +- tests/unit/core/compile/sqlglot/conftest.py | 92 ++++++++- .../test_compile_readlocal/out.sql | 176 +++++++++++++++++- .../test_compile_readlocal_w_json_df/out.sql | 4 + .../test_compile_readlocal_w_lists_df/out.sql | 41 ++++ .../out.sql | 19 ++ .../out.sql | 21 +++ .../compile/sqlglot/test_compile_readlocal.py | 31 ++- .../sql/compilers/bigquery/__init__.py | 1 - 11 files changed, 427 insertions(+), 55 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 5826ab2871..cb510ce365 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -146,22 +146,17 @@ def _compile_node( @_compile_node.register def 
compile_readlocal(node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: - offsets = node.offsets_col.sql if node.offsets_col else None - schema_names = node.schema.names - schema_dtypes = node.schema.dtypes - pa_table = node.local_data_source.data pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) - pa_table = pa_table.rename_columns( - {item.source_id: item.id.sql for item in node.scan_list.items} - ) + pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items]) + offsets = node.offsets_col.sql if node.offsets_col else None if offsets: pa_table = pa_table.append_column( offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) ) - return ir.SQLGlotIR.from_pandas(pa_table.to_pandas(), schema_names, schema_dtypes) + return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema) @_compile_node.register diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index fe85f22494..607e712a2b 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -17,13 +17,23 @@ import dataclasses import typing -import pandas as pd +import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery import sqlglot.expressions as sge from bigframes import dtypes import bigframes.core.compile.sqlglot.sqlglot_types as sgt +import bigframes.core.local_data as local_data +import bigframes.core.schema as schemata + +# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. 
+try: + from shapely.io import to_wkt # type: ignore +except ImportError: + from shapely.wkt import dumps # type: ignore + + to_wkt = dumps @dataclasses.dataclass(frozen=True) @@ -48,35 +58,32 @@ def sql(self) -> str: return self.expr.sql(dialect=self.dialect, pretty=self.pretty) @classmethod - def from_pandas( - cls, - pd_df: pd.DataFrame, - schema_names: typing.Sequence[str], - schema_dtypes: typing.Sequence[dtypes.Dtype], + def from_pyarrow( + cls, pa_table: pa.Table, schema: schemata.ArraySchema ) -> SQLGlotIR: """Builds SQLGlot expression from pyarrow table.""" dtype_expr = sge.DataType( this=sge.DataType.Type.STRUCT, expressions=[ sge.ColumnDef( - this=sge.to_identifier(name, quoted=True), - kind=sgt.SQLGlotType.from_bigframes_dtype(dtype), + this=sge.to_identifier(field.column, quoted=True), + kind=sgt.SQLGlotType.from_bigframes_dtype(field.dtype), ) - for name, dtype in zip(schema_names, schema_dtypes) + for field in schema.items ], nested=True, ) data_expr = [ - sge.Tuple( + sge.Struct( expressions=tuple( _literal( value=value, - dtype=sgt.SQLGlotType.from_bigframes_dtype(dtype), + dtype=field.dtype, ) - for value, dtype in zip(row, schema_dtypes) + for value, field in zip(tuple(row_dict.values()), schema.items) ) ) - for _, row in pd_df.iterrows() + for row_dict in local_data._iter_table(pa_table, schema) ] expr = sge.Unnest( expressions=[ @@ -105,13 +112,36 @@ def select( return SQLGlotIR(expr=expr) -def _literal(value: typing.Any, dtype: str) -> sge.Expression: +def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: + sqlglot_type = sgt.SQLGlotType.from_bigframes_dtype(dtype) if value is None: - return _cast(sge.Null(), dtype) - - # TODO: handle other types like visit_DefaultLiteral - return sge.convert(value) + return _cast(sge.Null(), sqlglot_type) + elif dtype == dtypes.BYTES_DTYPE: + return _cast(str(value), sqlglot_type) + elif dtypes.is_time_like(dtype): + return _cast(sge.convert(value.isoformat()), sqlglot_type) + elif 
dtypes.is_geo_like(dtype): + wkt = value if isinstance(value, str) else to_wkt(value) + return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt)) + elif dtype == dtypes.JSON_DTYPE: + return sge.ParseJSON(this=sge.convert(str(value))) + elif dtypes.is_struct_like(dtype): + items = [ + _literal(value=value[field_name], dtype=field_dtype).as_( + field_name, quoted=True + ) + for field_name, field_dtype in dtypes.get_struct_fields(dtype).items() + ] + return sge.Struct.from_arg_list(items) + elif dtypes.is_array_like(dtype): + value_type = dtypes.get_array_inner_type(dtype) + values = sge.Array( + expressions=[_literal(value=v, dtype=value_type) for v in value] + ) + return values if len(value) > 0 else _cast(values, sqlglot_type) + else: + return sge.convert(value) -def _cast(arg, to) -> sge.Cast: +def _cast(arg: typing.Any, to: str) -> sge.Cast: return sge.Cast(this=arg, to=to) diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 03755c94b7..2e5a1499b9 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -1,9 +1,9 @@ -{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", 
"geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} -{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} -{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} -{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": 
"12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! 
", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} +{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "LINESTRING(-0.127959 51.507728, -0.127026 51.507473)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} +{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", 
"numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 23456fc76c..4d871fd707 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -12,29 +12,101 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib + import pandas as pd +import pyarrow as pa import pytest +from bigframes import dtypes +import tests.system.utils + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" + -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def compiler_session(): from . import compiler_session return compiler_session.SQLCompilerSession() -@pytest.fixture(scope="module") -def all_types_df() -> pd.DataFrame: - # TODO: all types pandas dataframes +@pytest.fixture(scope="session") +def scalars_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing all scalar types and using the `rowindex` + column as the index.""" # TODO: add tests for empty dataframes + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + return df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing STRUCT types and using the `id` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", 
address_struct_schema), + ] + ) + df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) + return df + + +@pytest.fixture(scope="session") +def repeated_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing LIST types and using the `rowindex` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + df = df.set_index("rowindex") + return df + + +@pytest.fixture(scope="session") +def json_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing JSON types and using the `rowindex` + column as the index.""" + json_data = [ + "null", + "true", + "100", + "0.98", + '"a string"', + "[]", + "[1, 2, 3]", + '[{"a": 1}, {"a": 2}, {"a": null}, {}]', + '"100"', + '{"date": "2024-07-16"}', + '{"int_value": 2, "null_filed": null}', + '{"list_data": [10, 20, 30]}', + ] df = pd.DataFrame( { - "int1": pd.Series([1, 2, 3], dtype="Int64"), - "int2": pd.Series([-10, 20, 30], dtype="Int64"), - "bools": pd.Series([True, None, False], dtype="boolean"), - "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"), + "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), }, + index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), ) - # add more complexity index. 
- df.index = df.index.astype("Int64") return df diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index c2a443dab2..0ef80dc8b0 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -1,7 +1,171 @@ SELECT - `bfcol_0` AS `bfcol_5`, - `bfcol_1` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7`, - `bfcol_3` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` -FROM UNNEST(ARRAY>[(1, -10, TRUE, 'b', 0), (2, 20, CAST(NULL AS BOOLEAN), 'aa', 1), (3, 30, FALSE, 'ccc', 2)]) \ No newline at end of file + `bfcol_0` AS `bfcol_16`, + `bfcol_1` AS `bfcol_17`, + `bfcol_2` AS `bfcol_18`, + `bfcol_3` AS `bfcol_19`, + `bfcol_4` AS `bfcol_20`, + `bfcol_5` AS `bfcol_21`, + `bfcol_6` AS `bfcol_22`, + `bfcol_7` AS `bfcol_23`, + `bfcol_8` AS `bfcol_24`, + `bfcol_9` AS `bfcol_25`, + `bfcol_10` AS `bfcol_26`, + `bfcol_11` AS `bfcol_27`, + `bfcol_12` AS `bfcol_28`, + `bfcol_13` AS `bfcol_29`, + `bfcol_14` AS `bfcol_30`, + `bfcol_15` AS `bfcol_31` +FROM UNNEST(ARRAY>[STRUCT( + 0, + TRUE, + CAST(b'Hello, World!' 
AS BYTES), + CAST('2021-07-21' AS DATE), + CAST('2021-07-21T11:39:45' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), + 123456789, + 0, + 1.234567890, + 1.25, + 0, + 0, + 'Hello, World!', + CAST('11:41:43.076160' AS TIME), + CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP), + 0 +), STRUCT( + 1, + FALSE, + CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES), + CAST('1991-02-03' AS DATE), + CAST('1991-01-02T03:45:06' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), + -987654321, + 1, + 1.234567890, + 2.51, + 1, + 1, + 'こんにちは', + CAST('11:14:34.701606' AS TIME), + CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP), + 1 +), STRUCT( + 2, + TRUE, + CAST(b'\xc2\xa1Hola Mundo!' AS BYTES), + CAST('2023-03-01' AS DATE), + CAST('2023-03-01T10:55:13' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), + 314159, + 0, + 101.101010100, + 25000000000.0, + 2, + 2, + ' ¡Hola Mundo! ', + CAST('23:59:59.999999' AS TIME), + CAST('2023-03-01T10:55:13.250125+00:00' AS TIMESTAMP), + 2 +), STRUCT( + 3, + CAST(NULL AS BOOLEAN), + CAST(NULL AS BYTES), + CAST(NULL AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + CAST(NULL AS INT64), + 1, + CAST(NULL AS NUMERIC), + CAST(NULL AS FLOAT64), + 3, + 3, + CAST(NULL AS STRING), + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 3 +), STRUCT( + 4, + FALSE, + CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES), + CAST('2021-07-21' AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + -234892, + -2345, + CAST(NULL AS NUMERIC), + CAST(NULL AS FLOAT64), + 4, + 4, + 'Hello, World!', + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 4 +), STRUCT( + 5, + FALSE, + CAST(b'G\xc3\xbcten Tag' AS BYTES), + CAST('1980-03-14' AS DATE), + CAST('1980-03-14T15:16:17' AS DATETIME), + CAST(NULL AS GEOGRAPHY), + 55555, + 0, + 5.555555000, + 555.555, + 5, + 5, + 'Güten Tag!', + CAST('15:16:17.181921' AS TIME), + 
CAST('1980-03-14T15:16:17.181921+00:00' AS TIMESTAMP), + 5 +), STRUCT( + 6, + TRUE, + CAST(b'Hello\tBigFrames!\x07' AS BYTES), + CAST('2023-05-23' AS DATE), + CAST('2023-05-23T11:37:01' AS DATETIME), + ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), + 101202303, + 2, + -10.090807000, + -123.456, + 6, + 6, + 'capitalize, This ', + CAST('01:02:03.456789' AS TIME), + CAST('2023-05-23T11:42:55.000001+00:00' AS TIMESTAMP), + 6 +), STRUCT( + 7, + TRUE, + CAST(NULL AS BYTES), + CAST('2038-01-20' AS DATE), + CAST('2038-01-19T03:14:08' AS DATETIME), + CAST(NULL AS GEOGRAPHY), + -214748367, + 2, + 11111111.100000000, + 42.42, + 7, + 7, + ' سلام', + CAST('12:00:00.000001' AS TIME), + CAST('2038-01-19T03:14:17.999999+00:00' AS TIMESTAMP), + 7 +), STRUCT( + 8, + FALSE, + CAST(NULL AS BYTES), + CAST(NULL AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + 2, + 1, + CAST(NULL AS NUMERIC), + 6.87, + 8, + 8, + 'T', + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 8 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql new file mode 100644 index 0000000000..3b780e6d8e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -0,0 +1,4 @@ +SELECT + `bfcol_0` AS `bfcol_2`, + `bfcol_1` AS `bfcol_3` +FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), 
STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql new file mode 100644 index 0000000000..6998b41b27 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -0,0 +1,41 @@ +SELECT + `bfcol_0` AS `bfcol_9`, + `bfcol_1` AS `bfcol_10`, + `bfcol_2` AS `bfcol_11`, + `bfcol_3` AS `bfcol_12`, + `bfcol_4` AS `bfcol_13`, + `bfcol_5` AS `bfcol_14`, + `bfcol_6` AS `bfcol_15`, + `bfcol_7` AS `bfcol_16`, + `bfcol_8` AS `bfcol_17` +FROM UNNEST(ARRAY, `bfcol_2` ARRAY, `bfcol_3` ARRAY, `bfcol_4` ARRAY, `bfcol_5` ARRAY, `bfcol_6` ARRAY, `bfcol_7` ARRAY, `bfcol_8` INT64>>[STRUCT( + 0, + [1], + [TRUE], + [1.2, 2.3], + ['2021-07-21'], + ['2021-07-21 11:39:45'], + [1.2, 2.3, 3.4], + ['abc', 'de', 'f'], + 0 +), STRUCT( + 1, + [1, 2], + [TRUE, FALSE], + [1.1], + ['2021-07-21', '1987-03-28'], + ['1999-03-14 17:22:00'], + [5.5, 2.3], + ['a', 'bc', 'de'], + 1 +), STRUCT( + 2, + [1, 2, 3], + [TRUE], + [0.5, -1.9, 2.3], + ['2017-08-01', '2004-11-22'], + ['1979-06-03 03:20:45'], + [1.7000000000000002], + ['', 'a'], + 2 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql new file mode 100644 index 0000000000..42b7bc7361 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql @@ -0,0 +1,19 @@ +SELECT + * +FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[( + 1, + STRUCT( + 'Alice' AS `name`, + 30 AS `age`, + STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` + ), + 0 
+), ( + 2, + STRUCT( + 'Bob' AS `name`, + 25 AS `age`, + STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` + ), + 1 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql new file mode 100644 index 0000000000..99b94915bf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -0,0 +1,21 @@ +SELECT + `bfcol_0` AS `bfcol_3`, + `bfcol_1` AS `bfcol_4`, + `bfcol_2` AS `bfcol_5` +FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[STRUCT( + 1, + STRUCT( + 'Alice' AS `name`, + 30 AS `age`, + STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` + ), + 0 +), STRUCT( + 2, + STRUCT( + 'Bob' AS `name`, + 25 AS `age`, + STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` + ), + 1 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index ad7cc5785a..58587da129 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -22,7 +22,34 @@ def test_compile_readlocal( - all_types_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot + scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): - bf_df = bpd.DataFrame(all_types_df, session=compiler_session) + bf_df = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readlocal_w_structs_df( + nested_structs_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def 
test_compile_readlocal_w_lists_df( + repeated_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readlocal_w_json_df( + json_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 7e001d1ac3..be8f9fc555 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1067,7 +1067,6 @@ def visit_InMemoryTable(self, op, *, name, schema, data): columns=columns, ), ) - # return expr return sg.select(sge.Star()).from_(expr) def visit_ArrayAggregate(self, op, *, arg, order_by, where): From f3fd7e226ec28f7c01370adb6f2c250432b7e4b3 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:59:59 -0700 Subject: [PATCH 28/28] chore(main): release 2.2.0 (#1643) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 43 +++++++++++++++++++++++ bigframes/version.py | 4 +-- third_party/bigframes_vendored/version.py | 4 +-- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1e331d1d..b6c08af05e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,49 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.1.0...v2.2.0) (2025-04-30) + + +### Features + +* Add gemini-2.0-flash-001 and gemini-2.0-flash-lite-001 to fine tune score endpoints
and multimodal endpoints ([#1650](https://github.com/googleapis/python-bigquery-dataframes/issues/1650)) ([4fb54df](https://github.com/googleapis/python-bigquery-dataframes/commit/4fb54dfe448604a90fc1818cf18b1e77e1e7227b)) +* Add GeminiTextGenerator.predict structured output ([#1653](https://github.com/googleapis/python-bigquery-dataframes/issues/1653)) ([6199023](https://github.com/googleapis/python-bigquery-dataframes/commit/6199023a6a71e72e926f5879e74a15215bc6e4a0)) +* DataFrames.__getitem__ support for slice input ([#1668](https://github.com/googleapis/python-bigquery-dataframes/issues/1668)) ([563f0cb](https://github.com/googleapis/python-bigquery-dataframes/commit/563f0cbdf4a18c3cd1bd2a4b52de823165638911)) +* Print right origin of `PreviewWarning` for the `bpd.udf` ([#1629](https://github.com/googleapis/python-bigquery-dataframes/issues/1629)) ([48d10d1](https://github.com/googleapis/python-bigquery-dataframes/commit/48d10d1f0150a29dd3b91f505f8d3874e0b88c42)) +* Session.bytes_processed_sum will be updated when allow_large_re… ([#1669](https://github.com/googleapis/python-bigquery-dataframes/issues/1669)) ([ae312db](https://github.com/googleapis/python-bigquery-dataframes/commit/ae312dbed25da6da5e2817d5c9838654c2a1ad1c)) +* Short circuit query for local scan ([#1618](https://github.com/googleapis/python-bigquery-dataframes/issues/1618)) ([e84f232](https://github.com/googleapis/python-bigquery-dataframes/commit/e84f232b0fc5e2167a7cddb355cf0c8837ae5422)) +* Support names parameter in read_csv for bigquery engine ([#1659](https://github.com/googleapis/python-bigquery-dataframes/issues/1659)) ([3388191](https://github.com/googleapis/python-bigquery-dataframes/commit/33881914ab5b8d0e701eabd9c731aed1deab3d49)) +* Support passing list of values to bigframes.core.sql.simple_literal ([#1641](https://github.com/googleapis/python-bigquery-dataframes/issues/1641)) 
([102d363](https://github.com/googleapis/python-bigquery-dataframes/commit/102d363aa7e3245ff262c817bc756ea0eaee57e7)) +* Support write api as loading option ([#1617](https://github.com/googleapis/python-bigquery-dataframes/issues/1617)) ([c46ad06](https://github.com/googleapis/python-bigquery-dataframes/commit/c46ad0647785a9207359eba0fb5b6f7a16610f2a)) + + +### Bug Fixes + +* DataFrame accessors are not populated ([#1639](https://github.com/googleapis/python-bigquery-dataframes/issues/1639)) ([28afa2c](https://github.com/googleapis/python-bigquery-dataframes/commit/28afa2c73c0517f9365fab05193706631b656551)) +* Prefer remote schema instead of throwing on materialize conflicts ([#1644](https://github.com/googleapis/python-bigquery-dataframes/issues/1644)) ([53fc25b](https://github.com/googleapis/python-bigquery-dataframes/commit/53fc25bfc86e166b91e5001506051b1cac34c996)) +* Remove itertools.pairwise usage ([#1638](https://github.com/googleapis/python-bigquery-dataframes/issues/1638)) ([9662745](https://github.com/googleapis/python-bigquery-dataframes/commit/9662745265c8c6e42f372629bd2c7806542cee1a)) +* Resolve issue where pre-release versions of google-auth are installed ([#1491](https://github.com/googleapis/python-bigquery-dataframes/issues/1491)) ([ebb7a5e](https://github.com/googleapis/python-bigquery-dataframes/commit/ebb7a5e2b24fa57d6fe6a76d9b857ad44c67d194)) +* Resolve some of the typo errors ([#1655](https://github.com/googleapis/python-bigquery-dataframes/issues/1655)) ([cd7fbde](https://github.com/googleapis/python-bigquery-dataframes/commit/cd7fbde026522f53a23a4bb6585ad8629769fad1)) + + +### Performance Improvements + +* Fold row count ops when known ([#1656](https://github.com/googleapis/python-bigquery-dataframes/issues/1656)) ([c958dbe](https://github.com/googleapis/python-bigquery-dataframes/commit/c958dbea32b77cec9fddfc09e3b40d1da220a42c)) +* Use flyweight for node fields ([#1654](https://github.com/googleapis/python-bigquery-dataframes/issues/1654))
([8482bfc](https://github.com/googleapis/python-bigquery-dataframes/commit/8482bfc1d4caa91a35c4fbf0be420301d05ad544)) + + +### Dependencies + +* Support shapely 1.8.5+ again ([#1651](https://github.com/googleapis/python-bigquery-dataframes/issues/1651)) ([ae83e61](https://github.com/googleapis/python-bigquery-dataframes/commit/ae83e61c49ade64d6f727e9f364bd2f1aeec6e19)) + + +### Documentation + +* Add JSON data types notebook ([#1647](https://github.com/googleapis/python-bigquery-dataframes/issues/1647)) ([9128c4a](https://github.com/googleapis/python-bigquery-dataframes/commit/9128c4a31dab487bc23f67c43380abd0beda5b1c)) +* Add sample code snippets for `udf` ([#1649](https://github.com/googleapis/python-bigquery-dataframes/issues/1649)) ([53caa8d](https://github.com/googleapis/python-bigquery-dataframes/commit/53caa8d689e64436f5313095ee27479a06d8e8a8)) +* Fix `bq_dataframes_template` notebook to work if partial ordering mode is enabled ([#1665](https://github.com/googleapis/python-bigquery-dataframes/issues/1665)) ([f442e7a](https://github.com/googleapis/python-bigquery-dataframes/commit/f442e7a07ff273ba3af74eeabafb62110b78f692)) +* Note that `udf` is in preview and must be python 3.11 compatible ([#1629](https://github.com/googleapis/python-bigquery-dataframes/issues/1629)) ([48d10d1](https://github.com/googleapis/python-bigquery-dataframes/commit/48d10d1f0150a29dd3b91f505f8d3874e0b88c42)) + ## [2.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.0.0...v2.1.0) (2025-04-22) diff --git a/bigframes/version.py b/bigframes/version.py index b671169b24..c6ca0ee57c 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.1.0" +__version__ = "2.2.0" # {x-release-please-start-date} -__release_date__ = "2025-04-22" +__release_date__ = "2025-04-30" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index b671169b24..c6ca0ee57c 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.1.0" +__version__ = "2.2.0" # {x-release-please-start-date} -__release_date__ = "2025-04-22" +__release_date__ = "2025-04-30" # {x-release-please-end}