# Legacy SQL spellings (the ones used in the BigQuery schema API), keyed to
# the GoogleSQL type names that are valid in generated SQL.
_LEGACY_TO_GOOGLESQL_TYPES = dict(
    BOOLEAN="BOOL",
    INTEGER="INT64",
    FLOAT="FLOAT64",
)


def _type_to_sql(field: google.cloud.bigquery.SchemaField):
    """Turn the type information of the field into SQL.

    RECORD/STRUCT fields are expanded through ``_to_struct`` using the
    field's sub-fields. Any other type name is looked up (case-insensitively)
    in ``_LEGACY_TO_GOOGLESQL_TYPES``; names without an entry are returned
    exactly as they appear on the field.
    """
    declared_type = field.field_type
    is_struct = declared_type.casefold() in ("record", "struct")

    if not is_struct:
        # The schema API reports legacy SQL names. Importantly, FLOAT is from
        # legacy SQL, but not valid in GoogleSQL. See internal issue
        # b/428190014.
        return _LEGACY_TO_GOOGLESQL_TYPES.get(declared_type.upper(), declared_type)

    return _to_struct(field.fields)


def _run_read_gbq_colab_sessionless_dry_run(
    query: str,
    *,
    pyformat_args: Dict[str, Any],
) -> pandas.Series:
    """Run a dry_run without a session.

    ``query`` is formatted with ``pyformat_args`` in dry-run mode, submitted
    through ``_dry_run`` with the client from ``_get_bqclient``, and the
    value computed by ``dry_runs.get_query_stats_with_inferred_dtypes`` for
    the resulting job is returned.
    """
    sql = bigframes.core.pyformat.pyformat(
        query,
        pyformat_args=pyformat_args,
        dry_run=True,
    )
    dry_run_job = _dry_run(sql, _get_bqclient())
    return dry_runs.get_query_stats_with_inferred_dtypes(dry_run_job, (), ())
+249,9 @@ def _try_read_gbq_colab_sessionless_dry_run( # to local data and not any BigQuery tables. with _default_location_lock: if not config.options.bigquery._session_started: - bqclient = _get_bqclient() - query = create_query() - job = _dry_run(query, bqclient) - return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + return _run_read_gbq_colab_sessionless_dry_run( + query, pyformat_args=pyformat_args + ) # Explicitly return None to indicate that we didn't run the dry run query. return None @@ -286,21 +304,13 @@ def _read_gbq_colab( if pyformat_args is None: pyformat_args = {} - # Delay formatting the query with the special "session-less" logic. This - # avoids doing unnecessary work if the session already has a location or has - # already started. - create_query = functools.partial( - bigframes.core.pyformat.pyformat, - query_or_table, - pyformat_args=pyformat_args, - dry_run=True, - ) - # Only try to set the global location if it's not a dry run. We don't want # to bind to a location too early. This is especially important if the query # only refers to local data and not any BigQuery tables. if dry_run: - result = _try_read_gbq_colab_sessionless_dry_run(create_query) + result = _try_read_gbq_colab_sessionless_dry_run( + query_or_table, pyformat_args=pyformat_args + ) if result is not None: return result @@ -309,6 +319,15 @@ def _read_gbq_colab( # started. That means we can safely call the "real" _read_gbq_colab, # which generates slightly nicer SQL. else: + # Delay formatting the query with the special "session-less" logic. This + # avoids doing unnecessary work if the session already has a location or has + # already started. 
+ create_query = functools.partial( + bigframes.core.pyformat.pyformat, + query_or_table, + pyformat_args=pyformat_args, + dry_run=True, + ) _set_default_session_location_if_possible_deferred_query(create_query) return global_session.with_default_session( diff --git a/tests/system/small/pandas/io/__init__.py b/tests/system/small/pandas/io/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/pandas/io/api/__init__.py b/tests/system/small/pandas/io/api/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/api/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def _series(values, dtype):
    """Build a pandas Series holding ``values`` with an explicit ``dtype``."""
    return pandas.Series(values, dtype=dtype)


def _arrow_series(values, arrow_type):
    """Build a pandas Series backed by the given pyarrow data type."""
    return pandas.Series(values, dtype=pandas.ArrowDtype(arrow_type))


# Sample values shared by the columns below. Each list has a NULL in the
# middle so that nullable representations are exercised too.
_UTC = datetime.timezone.utc
_BOOLS = [True, None, True]
_BYTES = [b"a", None, b"c"]
_DATES = [datetime.date(2023, 11, 23), None, datetime.date(1970, 1, 1)]
_NAIVE_DATETIMES = [
    datetime.datetime(2023, 11, 23, 13, 14, 15),
    None,
    datetime.datetime(1970, 1, 1, 0, 0, 0),
]
_UTC_DATETIMES = [
    datetime.datetime(2023, 11, 23, 13, 14, 15, tzinfo=_UTC),
    None,
    datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=_UTC),
]
_POINTS = [
    shapely.geometry.Point(145.0, -37.8),
    None,
    shapely.geometry.Point(-122.3, 47.6),
]
_INTS = [1, None, 3]
_DECIMALS = [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")]
_FLOATS = [1.23, None, 4.56]
_STRINGS = ["a", None, "c"]
_TIMES = [datetime.time(0, 0, 0), None, datetime.time(13, 7, 11)]

# Regression tests for b/428190014.
#
# Test every BigQuery type we support, especially those where the legacy
# SQL type name differs from the GoogleSQL type name.
#
# See:
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
# and compare to the legacy types at
# https://cloud.google.com/bigquery/docs/data-types
#
# Maps the pytest id of each case to the columns of its local DataFrame.
_LOCAL_DATAFRAME_COLUMNS = {
    "arrays": {
        "ints": _arrow_series([[1], [2], [3]], pyarrow.list_(pyarrow.int64())),
        "floats": _arrow_series(
            [[1.0], [2.0], [3.0]], pyarrow.list_(pyarrow.float64())
        ),
    },
    "bools": {
        "bool": _series([True, False, True], "bool"),
        "boolean": _series(_BOOLS, "boolean"),
        "object": _series(_BOOLS, "object"),
        "arrow": _arrow_series(_BOOLS, pyarrow.bool_()),
    },
    "bytes": {
        "bytes": _series([b"a", b"b", b"c"], numpy.bytes_),
        "object": _series(_BYTES, "object"),
        "arrow": _arrow_series(_BYTES, pyarrow.binary()),
    },
    "dates": {
        "object": _series(_DATES, "object"),
        "arrow": _arrow_series(_DATES, pyarrow.date32()),
    },
    "datetimes": {
        "object": _series(_NAIVE_DATETIMES, "object"),
        "datetime64": _series(_NAIVE_DATETIMES, "datetime64[us]"),
        "arrow": _arrow_series(_NAIVE_DATETIMES, pyarrow.timestamp("us")),
    },
    "geographys": {
        "object": _series(_POINTS, "object"),
        "geopandas": geopandas.GeoSeries(_POINTS),
    },
    # TODO(tswast): Add INTERVAL once BigFrames supports it.
    "jsons": {
        # TODO(tswast): Is there an equivalent object type we can use here?
        # TODO(tswast): Add built-in Arrow extension type
        "db_dtypes": _arrow_series(["{}", None, "123"], db_dtypes.JSONArrowType()),
    },
    "ints": {
        "int64": _series([1, 2, 3], "int64"),
        "Int64": _series(_INTS, "Int64"),
        "object": _series(_INTS, "object"),
        "arrow": _arrow_series(_INTS, pyarrow.int64()),
    },
    "numerics": {
        "object": _series(_DECIMALS, "object"),
        "arrow": _arrow_series(_DECIMALS, pyarrow.decimal128(38, 9)),
    },
    "bignumerics": {
        # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate?
        "arrow": _arrow_series(_DECIMALS, pyarrow.decimal256(76, 38)),
    },
    "floats": {
        "float64": _series(_FLOATS, "float64"),
        "Float64": _series(_FLOATS, "Float64"),
        "object": _series(_FLOATS, "object"),
        "arrow": _arrow_series(_FLOATS, pyarrow.float64()),
    },
    # TODO(tswast): Add RANGE once BigFrames supports it.
    "strings": {
        "string": _series(["a", "b", "c"], "string[python]"),
        "object": _series(_STRINGS, "object"),
        "arrow": _series(_STRINGS, "string[pyarrow]"),
    },
    "structs": {
        # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON?
        "arrow": _arrow_series(
            [{"a": 1, "b": 1.0, "c": "c"}],
            pyarrow.struct(
                [
                    ("a", pyarrow.int64()),
                    ("b", pyarrow.float64()),
                    ("c", pyarrow.string()),
                ]
            ),
        ),
    },
    "times": {
        "object": _series(_TIMES, "object"),
        "arrow": _arrow_series(_TIMES, pyarrow.time64("us")),
    },
    "timestamps": {
        "object": _series(_UTC_DATETIMES, "object"),
        "datetime64": _series(_NAIVE_DATETIMES, "datetime64[us]").dt.tz_localize(
            "UTC"
        ),
        "arrow": _arrow_series(_UTC_DATETIMES, pyarrow.timestamp("us", "UTC")),
    },
}


@pytest.mark.parametrize(
    ("df_pd",),
    tuple(
        pytest.param(pandas.DataFrame(columns), id=case_id)
        for case_id, columns in _LOCAL_DATAFRAME_COLUMNS.items()
    ),
)
def test_read_gbq_colab_sessionless_dry_run_generates_valid_sql_for_local_dataframe(
    df_pd: pandas.DataFrame,
):
    """Dry-run a SELECT over a local DataFrame and check a Series comes back."""
    # This method will fail with an exception if it receives invalid SQL.
    dry_run_stats = module_under_test._run_read_gbq_colab_sessionless_dry_run(
        query="SELECT * FROM {df_pd}",
        pyformat_args={"df_pd": df_pd},
    )
    assert isinstance(dry_run_stats, pandas.Series)
+ # + # See: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # and compare to the legacy types at + # https://cloud.google.com/bigquery/docs/data-types + # + # Test these against the real BigQuery dry run API in + # tests/system/small/pandas/io/api/test_read_gbq_colab.py pytest.param( pandas.DataFrame( { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [ - decimal.Decimal(1), - decimal.Decimal(2), - decimal.Decimal(3), - ], + "ints": pandas.Series( + [[1], [2], [3]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())), + ), + "floats": pandas.Series( + [[1.0], [2.0], [3.0]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), } ), - "STRUCT<`col1` INTEGER, `col2` STRING, `col3` NUMERIC>", - id="scalars", + "STRUCT<`ints` ARRAY, `floats` ARRAY>", + id="arrays", ), pytest.param( pandas.DataFrame( - {"array col": [[1, 2, 3]], "another array": [["a", "b", "c"]]} + { + "bool": pandas.Series([True, False, True], dtype="bool"), + "boolean": pandas.Series([True, None, True], dtype="boolean"), + "object": pandas.Series([True, None, True], dtype="object"), + "arrow": pandas.Series( + [True, None, True], dtype=pandas.ArrowDtype(pyarrow.bool_()) + ), + } ), - "STRUCT<`array col` ARRAY, `another array` ARRAY>", - id="arrays", + "STRUCT<`bool` BOOL, `boolean` BOOL, `object` BOOL, `arrow` BOOL>", + id="bools", ), pytest.param( pandas.DataFrame( { - "struct col": [ - {"subfield": {"subsubfield": 1}, "subfield2": 2}, - ], + "bytes": pandas.Series([b"a", b"b", b"c"], dtype=numpy.bytes_), + "object": pandas.Series([b"a", None, b"c"], dtype="object"), + "arrow": pandas.Series( + [b"a", None, b"c"], dtype=pandas.ArrowDtype(pyarrow.binary()) + ), + } + ), + "STRUCT<`bytes` BYTES, `object` BYTES, `arrow` BYTES>", + id="bytes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + 
datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + } + ), + "STRUCT<`object` DATE, `arrow` DATE>", + id="dates", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ), + "arrow": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + } + ), + "STRUCT<`object` DATETIME, `datetime64` DATETIME, `arrow` DATETIME>", + id="datetimes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ], + dtype="object", + ), + "geopandas": geopandas.GeoSeries( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ] + ), + } + ), + "STRUCT<`object` GEOGRAPHY, `geopandas` GEOGRAPHY>", + id="geographys", + ), + # TODO(tswast): Add INTERVAL once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Is there an equivalent object type we can use here? 
+ # TODO(tswast): Add built-in Arrow extension type + "db_dtypes": pandas.Series( + ["{}", None, "123"], + dtype=pandas.ArrowDtype(db_dtypes.JSONArrowType()), + ), + } + ), + "STRUCT<`db_dtypes` JSON>", + id="jsons", + ), + pytest.param( + pandas.DataFrame( + { + "int64": pandas.Series([1, 2, 3], dtype="int64"), + "Int64": pandas.Series([1, None, 3], dtype="Int64"), + "object": pandas.Series([1, None, 3], dtype="object"), + "arrow": pandas.Series( + [1, None, 3], dtype=pandas.ArrowDtype(pyarrow.int64()) + ), + } + ), + "STRUCT<`int64` INT64, `Int64` INT64, `object` INT64, `arrow` INT64>", + id="ints", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype="object", + ), + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal128(38, 9)), + ), + } + ), + "STRUCT<`object` NUMERIC, `arrow` NUMERIC>", + id="numerics", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate? + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal256(76, 38)), + ), + } + ), + "STRUCT<`arrow` BIGNUMERIC>", + id="bignumerics", + ), + pytest.param( + pandas.DataFrame( + { + "float64": pandas.Series([1.23, None, 4.56], dtype="float64"), + "Float64": pandas.Series([1.23, None, 4.56], dtype="Float64"), + "object": pandas.Series([1.23, None, 4.56], dtype="object"), + "arrow": pandas.Series( + [1.23, None, 4.56], dtype=pandas.ArrowDtype(pyarrow.float64()) + ), } ), - "STRUCT<`struct col` STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>", + "STRUCT<`float64` FLOAT64, `Float64` FLOAT64, `object` FLOAT64, `arrow` FLOAT64>", + id="floats", + ), + # TODO(tswast): Add RANGE once BigFrames supports it. 
+ pytest.param( + pandas.DataFrame( + { + "string": pandas.Series(["a", "b", "c"], dtype="string[python]"), + "object": pandas.Series(["a", None, "c"], dtype="object"), + "arrow": pandas.Series(["a", None, "c"], dtype="string[pyarrow]"), + } + ), + "STRUCT<`string` STRING, `object` STRING, `arrow` STRING>", + id="strings", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON? + "arrow": pandas.Series( + [{"a": 1, "b": 1.0, "c": "c"}], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("a", pyarrow.int64()), + ("b", pyarrow.float64()), + ("c", pyarrow.string()), + ] + ) + ), + ), + } + ), + "STRUCT<`arrow` STRUCT<`a` INT64, `b` FLOAT64, `c` STRING>>", id="structs", ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + "STRUCT<`object` TIME, `arrow` TIME>", + id="times", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ).dt.tz_localize("UTC"), + "arrow": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us", "UTC")), + ), + } + ), + "STRUCT<`object` TIMESTAMP, `datetime64` TIMESTAMP, `arrow` TIMESTAMP>", + id="timestamps", + ), + # More complicated edge cases: 
pytest.param( pandas.DataFrame( { @@ -135,14 +412,14 @@ def test_pyformat_with_no_variables(session): ], } ), - "STRUCT<`array of struct col` ARRAY, `subfield2` INTEGER>>>", + "STRUCT<`array of struct col` ARRAY, `subfield2` INT64>>>", id="array_of_structs", ), pytest.param( pandas.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}).rename( columns={"c1": "c", "c2": "c"} ), - "STRUCT<`c` INTEGER, `c_1` STRING>", + "STRUCT<`c` INT64, `c_1` STRING>", id="duplicate_column_names", ), ), diff --git a/tests/unit/core/tools/test_bigquery_schema.py b/tests/unit/core/tools/test_bigquery_schema.py index a5b0087801..aed8ae0323 100644 --- a/tests/unit/core/tools/test_bigquery_schema.py +++ b/tests/unit/core/tools/test_bigquery_schema.py @@ -9,9 +9,11 @@ "field, expected_sql", [ # Simple types - (bigquery.SchemaField("test_field", "INTEGER"), "INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. + (bigquery.SchemaField("test_field", "INTEGER"), "INT64"), (bigquery.SchemaField("test_field", "STRING"), "STRING"), - (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOLEAN"), + (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOL"), # RECORD/STRUCT types with nested fields directly ( bigquery.SchemaField( @@ -30,7 +32,7 @@ bigquery.SchemaField("another", "BOOLEAN"), ), ), - "STRUCT<`sub_field` INTEGER, `another` BOOLEAN>", + "STRUCT<`sub_field` INT64, `another` BOOL>", ), # Array is handled by _field_to_sql, instead. (bigquery.SchemaField("test_field", "NUMERIC", mode="REPEATED"), "NUMERIC"), @@ -54,7 +56,9 @@ def test_type_to_sql(field, expected_sql): "field, expected_sql", [ # Simple field - (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. 
+ (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INT64"), (bigquery.SchemaField("name", "STRING", "NULLABLE"), "`name` STRING"), # Repeated field (bigquery.SchemaField("tags", "STRING", "REPEATED"), "`tags` ARRAY"), @@ -69,7 +73,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("zip", "INTEGER"), ), ), - "`addresses` ARRAY>", + "`addresses` ARRAY>", ), # Simple STRUCT ( @@ -82,7 +86,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("city", "STRING"), ), ), - "`person` STRUCT<`age` INTEGER, `city` STRING>", + "`person` STRUCT<`age` INT64, `city` STRING>", ), ], ) @@ -102,7 +106,7 @@ def test_field_to_sql(field, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "STRUCT<`id` INTEGER, `name` STRING>", + "STRUCT<`id` INT64, `name` STRING>", ), # Nested RECORD/STRUCT ( @@ -118,7 +122,7 @@ def test_field_to_sql(field, expected_sql): ), ), ), - "STRUCT<`item_id` INTEGER, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", + "STRUCT<`item_id` INT64, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", ), # Repeated field ( @@ -143,7 +147,7 @@ def test_field_to_sql(field, expected_sql): ), bigquery.SchemaField("timestamp", "TIMESTAMP"), ), - "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", + "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", ), ], ) @@ -163,7 +167,7 @@ def test_to_struct(bqschema, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "UNNEST(ARRAY>[])", + "UNNEST(ARRAY>[])", ), # Complex schema with nested and repeated fields ( @@ -179,7 +183,7 @@ def test_to_struct(bqschema, expected_sql): ), ), ), - "UNNEST(ARRAY>>>[])", + "UNNEST(ARRAY>>>[])", ), ], )