From 5004d08c6af93471686ccb319c69cd38c7893042 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 12 Dec 2024 15:38:50 -0600
Subject: [PATCH 1/4] feat: `to_gbq` fails with `TypeError` if passing in a
 bigframes DataFrame object (#833)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: `to_gbq` fails with `TypeError` if passing in a bigframes DataFrame object

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot
---
 pandas_gbq/gbq.py         |  8 ++++++++
 tests/unit/test_to_gbq.py | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index b04ad131..feffd858 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1091,6 +1091,14 @@ def to_gbq(
 
         .. versionadded:: 0.23.3
     """
+    # If we get a bigframes.pandas.DataFrame object, it may be possible to use
+    # the code paths here, but it could potentially be quite expensive because
+    # of the queries involved in type detection. It would be safer just to
+    # fail early if there are bigframes-y methods available.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/824
+    if hasattr(dataframe, "to_pandas") and hasattr(dataframe, "to_gbq"):
+        raise TypeError(f"Expected a pandas.DataFrame, but got {repr(type(dataframe))}")
+
     _test_google_api_imports()
 
     from google.api_core import exceptions as google_exceptions
diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py
index 60ea8025..f4012dc8 100644
--- a/tests/unit/test_to_gbq.py
+++ b/tests/unit/test_to_gbq.py
@@ -11,6 +11,16 @@
 from pandas_gbq import gbq
 
 
+class FakeDataFrame:
+    """A fake bigframes DataFrame to avoid depending on bigframes."""
+
+    def to_gbq(self):
+        """Fake to_gbq() to mimic a bigframes object."""
+
+    def to_pandas(self):
+        """Fake to_pandas() to mimic a bigframes object."""
+
+
 @pytest.fixture
 def expected_load_method(mock_bigquery_client):
     return mock_bigquery_client.load_table_from_dataframe
@@ -66,6 +76,15 @@ def test_to_gbq_load_method_translates_exception(
     expected_load_method.assert_called_once()
 
 
+def test_to_gbq_with_bigframes_raises_typeerror():
+    dataframe = FakeDataFrame()
+
+    with pytest.raises(
+        TypeError, match=r"Expected a pandas.DataFrame, but got .+FakeDataFrame"
+    ):
+        gbq.to_gbq(dataframe, "my_dataset.my_table", project_id="myproj")
+
+
 def test_to_gbq_with_if_exists_append(mock_bigquery_client, expected_load_method):
     from google.cloud.bigquery import SchemaField
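The new guard is purely duck-typed, so it can be tried without bigframes installed. A minimal sketch mirroring the `FakeDataFrame` unit test above; the dataset, table, and project names are placeholders:

    import pandas_gbq

    class FakeDataFrame:
        """Mimics the two methods the guard looks for on a bigframes object."""

        def to_pandas(self):
            """Never called; the guard only checks that the method exists."""

        def to_gbq(self):
            """Never called; the guard only checks that the method exists."""

    # Raises TypeError("Expected a pandas.DataFrame, but got <class '...FakeDataFrame'>")
    # before _test_google_api_imports() or any network call runs.
    pandas_gbq.to_gbq(FakeDataFrame(), "my_dataset.my_table", project_id="myproj")
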
From 5484a8c69965549e36afa4388ff42be3f83ec097 Mon Sep 17 00:00:00 2001
From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com>
Date: Wed, 18 Dec 2024 02:41:50 +0800
Subject: [PATCH 2/4] chore(python): update dependencies in
 .kokoro/docker/docs (#841)

Source-Link: https://github.com/googleapis/synthtool/commit/e808c98e1ab7eec3df2a95a05331619f7001daef
Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:8e3e7e18255c22d1489258d0374c901c01f9c4fd77a12088670cd73d580aa737
Co-authored-by: Owl Bot
---
 .github/.OwlBot.lock.yaml            |  4 +--
 .kokoro/docker/docs/requirements.txt | 52 ++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 6301519a..26306af6 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -13,5 +13,5 @@
 # limitations under the License.
 docker:
   image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
-  digest: sha256:2ed982f884312e4883e01b5ab8af8b6935f0216a5a2d82928d273081fc3be562
-# created: 2024-11-12T12:09:45.821174897Z
+  digest: sha256:8e3e7e18255c22d1489258d0374c901c01f9c4fd77a12088670cd73d580aa737
+# created: 2024-12-17T00:59:58.625514486Z
diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt
index 8bb07645..f99a5c4a 100644
--- a/.kokoro/docker/docs/requirements.txt
+++ b/.kokoro/docker/docs/requirements.txt
@@ -2,11 +2,11 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --allow-unsafe --generate-hashes requirements.in
+#    pip-compile --allow-unsafe --generate-hashes synthtool/gcp/templates/python_library/.kokoro/docker/docs/requirements.in
 #
-argcomplete==3.5.1 \
-    --hash=sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363 \
-    --hash=sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4
+argcomplete==3.5.2 \
+    --hash=sha256:036d020d79048a5d525bc63880d7a4b8d1668566b8a76daf1144c0bbe0f63472 \
+    --hash=sha256:23146ed7ac4403b70bd6026402468942ceba34a6732255b9edf5b7354f68a6bb
     # via nox
 colorlog==6.9.0 \
     --hash=sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff \
 filelock==3.16.1 \
 nox==2024.10.9 \
     --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \
     --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95
     # via -r requirements.in
 packaging==24.2 \
     --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
     --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
 platformdirs==4.3.6 \
     --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \
     --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb
     # via virtualenv
-tomli==2.0.2 \
-    --hash=sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38 \
-    --hash=sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed
+tomli==2.2.1 \
+    --hash=sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6 \
+    --hash=sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd \
+    --hash=sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c \
+    --hash=sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b \
+    --hash=sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8 \
+    --hash=sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6 \
+    --hash=sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77 \
+    --hash=sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff \
+    --hash=sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea \
+    --hash=sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192 \
+    --hash=sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249 \
+    --hash=sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee \
+    --hash=sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4 \
+    --hash=sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98 \
+    --hash=sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8 \
+    --hash=sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4 \
+    --hash=sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281 \
+    --hash=sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744 \
+    --hash=sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69 \
+    --hash=sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13 \
+    --hash=sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140 \
+    --hash=sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e \
+    --hash=sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e \
+    --hash=sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc \
+    --hash=sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff \
+    --hash=sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec \
+    --hash=sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2 \
+    --hash=sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222 \
+    --hash=sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106 \
+    --hash=sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272 \
+    --hash=sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a \
+    --hash=sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7
     # via nox
-virtualenv==20.27.1 \
-    --hash=sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba \
-    --hash=sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4
+virtualenv==20.28.0 \
+    --hash=sha256:23eae1b4516ecd610481eda647f3a7c09aea295055337331bb4e6892ecce47b0 \
+    --hash=sha256:2c9c3262bb8e7b87ea801d715fae4495e6032450c71d2309be9550e7364049aa
     # via nox

From cf1aadd48165617768fecff91e68941255148dbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 19 Dec 2024 11:22:03 -0600
Subject: [PATCH 3/4] fix: `to_gbq` uses `default_type` for ambiguous array
 types and struct field types (#838)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: `to_gbq` uses `default_type` for ambiguous array types and struct field types

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* fix arrow list(null) case too

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* lint

* Update pandas_gbq/schema/pandas_to_bigquery.py

Co-authored-by: Chalmer Lowe

* Update pandas_gbq/schema/pandas_to_bigquery.py

Co-authored-by: Chalmer Lowe

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* remove redundant string check

* Apply suggestions from code review

Co-authored-by: Chalmer Lowe

* add docstrings and a few more test cases

* use python 3.10 for docs github action

---------

Co-authored-by: Owl Bot
Co-authored-by: Chalmer Lowe
---
 .github/workflows/docs.yml                    |   2 +-
 owlbot.py                                     |   1 +
 pandas_gbq/schema/pandas_to_bigquery.py       | 111 +++++++++++++++---
 pandas_gbq/schema/pyarrow_to_bigquery.py      |  61 +++++++++-
 tests/unit/schema/test_pandas_to_bigquery.py  |  49 ++++++--
 tests/unit/schema/test_pyarrow_to_bigquery.py |  18 ++-
 tests/unit/test_schema.py                     |  51 +++++++-
 7 files changed, 244 insertions(+), 49 deletions(-)
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 698fbc5c..2833fe98 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - name: Install nox
         run: |
           python -m pip install --upgrade setuptools pip wheel
diff --git a/owlbot.py b/owlbot.py
index 190298a6..e50b9e9e 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -57,6 +57,7 @@
         "noxfile.py",
         "README.rst",
         # exclude this file as we have an alternate prerelease.cfg
+        ".github/workflows/docs.yml",
         ".kokoro/presubmit/prerelease-deps.cfg",
         ".kokoro/presubmit/presubmit.cfg",
     ],
diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py
index 5a979a12..5afae356 100644
--- a/pandas_gbq/schema/pandas_to_bigquery.py
+++ b/pandas_gbq/schema/pandas_to_bigquery.py
@@ -4,7 +4,7 @@
 
 import collections.abc
 import datetime
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 import warnings
 
 import db_dtypes
@@ -28,14 +28,21 @@
 # `docs/source/writing.rst`.
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
+    "boolean": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
+    "datetime64[us, UTC]": "TIMESTAMP",
     "datetime64[ns]": "DATETIME",
+    "datetime64[us]": "DATETIME",
     "float32": "FLOAT",
     "float64": "FLOAT",
     "int8": "INTEGER",
     "int16": "INTEGER",
     "int32": "INTEGER",
     "int64": "INTEGER",
+    "Int8": "INTEGER",
+    "Int16": "INTEGER",
+    "Int32": "INTEGER",
+    "Int64": "INTEGER",
     "uint8": "INTEGER",
     "uint16": "INTEGER",
     "uint32": "INTEGER",
@@ -103,7 +110,7 @@ def dataframe_to_bigquery_fields(
 
         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)
 
         if bq_field:
             bq_schema_out.append(bq_field)
@@ -114,7 +121,9 @@ def dataframe_to_bigquery_fields(
             arrow_value = pyarrow.array(values)
             bq_field = (
                 pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                    column, arrow_value.type
+                    column,
+                    arrow_value.type,
+                    default_type=default_type,
                 )
             )
 
@@ -151,6 +160,19 @@ def dataframe_to_bigquery_fields(
 
 
 def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a pandas dtype.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        dtype:
+            A pandas / numpy dtype object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is ambiguous like the object dtype.
+    """
     bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
 
     if bq_type is not None:
@@ -164,9 +186,44 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
 
     return None
 
 
-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
+def value_to_bigquery_field(
+    name: str, value: Any, default_type: Optional[str] = None
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a single value.
+
+    Args:
+        name:
+            The name of the field.
+        value:
+            The value to infer the type from. If None, the default type is used
+            if available.
+        default_type:
+            The default field type. Defaults to None.
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
+
+    # Set the SchemaField datatype to the given default_type if the value
+    # being assessed is None.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
+    # Map from Python types to BigQuery types. This isn't super exhaustive
+    # because we rely more on pyarrow, which can check more than one value to
+    # determine the type.
+    type_mapping = {
+        str: "STRING",
+    }
+
+    # geopandas and shapely are optional dependencies, so only check if those
+    # are installed.
+    if _BaseGeometry is not None:
+        type_mapping[_BaseGeometry] = "GEOGRAPHY"
+
+    for type_, bq_type in type_mapping.items():
+        if isinstance(value, type_):
+            return schema.SchemaField(name, bq_type)
 
     # For timezone-naive datetimes, the later pyarrow conversion used to try
     # and learn the type adds a timezone to such datetimes, causing them to be
@@ -182,35 +239,51 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
     else:
         return schema.SchemaField(name, "DATETIME")
 
-    if _BaseGeometry is not None and isinstance(value, _BaseGeometry):
-        return schema.SchemaField(name, "GEOGRAPHY")
-
     return None
 
 
-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(
+    name: str, values: Any, default_type: str = "STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a list of values.
+
+    This function iterates through the given values to determine the
+    corresponding schema field type.
+
+    Args:
+        name:
+            The name of the field.
+        values:
+            An iterable of values to infer the type from. If all the values
+            are None or the iterable is empty, the function returns None.
+        default_type:
+            The default field type to use if a specific type cannot be
+            determined from the values. Defaults to "STRING".
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
     value = pandas_gbq.core.pandas.first_valid(values)
 
-    # All NULL, type not determinable.
+    # All values came back as NULL, thus type not determinable by this method.
+    # Return None so we can try other methods.
     if value is None:
         return None
 
-    field = value_to_bigquery_field(name, value)
-    if field is not None:
+    field = value_to_bigquery_field(name, value, default_type=default_type)
+    if field:
         return field
 
-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
-
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
     if isinstance(value, collections.abc.Iterable) and not isinstance(
         value, collections.abc.Mapping
    ):
         # It could be that this value contains all None or is empty, so get the
         # first non-None value we can find.
         valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)
 
         if field is not None:
             return schema.SchemaField(name, field.field_type, mode="REPEATED")
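Threading `default_type` through these helpers changes what the pandas-level inference returns. A short sketch against the helpers as modified above; note these are internal modules, so the import path may change between releases:

    import pandas

    from pandas_gbq.schema import pandas_to_bigquery

    # A list column with at least one valid element: the element type is
    # detected and the field is marked REPEATED.
    pandas_to_bigquery.values_to_bigquery_field(
        "tags", pandas.Series([["a", "b"], []]), default_type="STRING"
    )
    # -> SchemaField('tags', 'STRING', 'REPEATED', ...)

    # An all-None column still cannot be typed at this level: None is returned
    # so the caller can fall back to the pyarrow-based detection, which is
    # where default_type is ultimately applied.
    pandas_to_bigquery.values_to_bigquery_field(
        "state", pandas.Series([None, None]), default_type="STRING"
    )
    # -> None
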
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
index da1a1ce8..91677f9d 100644
--- a/pandas_gbq/schema/pyarrow_to_bigquery.py
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -37,7 +37,31 @@
 }
 
 
-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+        default_type (str):
+            The type to assume for ``null()`` values. Defaults to "STRING".
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+            ``null()`` fields are assumed to be the ``default_type``, since
+            there are no values that contradict that.
+    """
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -52,22 +76,49 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         return schema.SchemaField(name, detected_type)
 
     if pyarrow.types.is_list(type_):
-        return arrow_list_type_to_bigquery(name, type_)
+        return arrow_list_type_to_bigquery(name, type_, default_type=default_type)
 
     if pyarrow.types.is_struct(type_):
         inner_fields: list[pyarrow.Field] = []
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(
+                arrow_type_to_bigquery_field(
+                    field.name, field.type, default_type=default_type
+                )
+            )
         return schema.SchemaField(name, "RECORD", fields=inner_fields)
 
     return None
 
 
-def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]:
-    inner_field = arrow_type_to_bigquery_field(name, type_.value_type)
+def arrow_list_type_to_bigquery(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow list type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+        default_type (str):
+            The type to assume for ``null()`` values. Defaults to "STRING".
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+            ``null()`` fields are assumed to be the ``default_type``, since
+            there are no values that contradict that.
+    """
+    inner_field = arrow_type_to_bigquery_field(
+        name, type_.value_type, default_type=default_type
+    )
+
+    # If this is None, it means we got some type that we can't cleanly map to
+    # a BigQuery type, so bubble that status up.
     if inner_field is None:
         return None
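The `null()` fallback can be exercised in isolation. A sketch against the modified helper (again an internal module; the FLOAT default below is an arbitrary choice for illustration):

    import pyarrow

    from pandas_gbq.schema import pyarrow_to_bigquery

    # A bare null() type falls back to the default type, STRING by default.
    pyarrow_to_bigquery.arrow_type_to_bigquery_field("state", pyarrow.null())
    # -> SchemaField('state', 'STRING', 'NULLABLE', ...)

    # A list of null() becomes a REPEATED field of the default type.
    pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "values", pyarrow.list_(pyarrow.null()), default_type="FLOAT"
    )
    # -> SchemaField('values', 'FLOAT', 'REPEATED', ...)
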
diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py
index 924ce1ee..777c3825 100644
--- a/tests/unit/schema/test_pandas_to_bigquery.py
+++ b/tests/unit/schema/test_pandas_to_bigquery.py
@@ -21,13 +21,34 @@ def module_under_test():
 def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "b"]),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
+            ("nullable_int_column", pandas.Series([42, None], dtype="Int64")),
+            ("uint_column", pandas.Series([7, 13], dtype="uint8")),
             ("bool_column", [True, False]),
+            ("boolean_column", pandas.Series([True, None], dtype="boolean")),
+            (
+                "datetime_column",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
+            (
+                "timestamp_column",
+                [
+                    datetime.datetime(
+                        1999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc
+                    ),
+                    datetime.datetime(
+                        2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
+                    ),
+                ],
+            ),
         ]
     )
-    index = pandas.Index(["a", "b"], name="str_index")
-    dataframe = pandas.DataFrame(df_data, index=index)
+    dataframe = pandas.DataFrame(df_data).set_index("str_index", drop=True)
 
     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
@@ -37,7 +58,12 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
         schema.SchemaField("str_index", "STRING", "NULLABLE"),
         schema.SchemaField("str_column", "STRING", "NULLABLE"),
         schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("nullable_int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("uint_column", "INTEGER", "NULLABLE"),
         schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("boolean_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("datetime_column", "DATETIME", "NULLABLE"),
+        schema.SchemaField("timestamp_column", "TIMESTAMP", "NULLABLE"),
     )
 
     assert returned_schema == expected_schema
@@ -45,19 +71,24 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
 def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "a"]),
+            ("int_index", [0, 0]),
+            (
+                "dt_index",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
             ("bool_column", [True, False]),
         ]
     )
-    index = pandas.MultiIndex.from_tuples(
-        [
-            ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)),
-            ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)),
-        ],
-        names=["str_index", "int_index", "dt_index"],
+    dataframe = pandas.DataFrame(df_data).set_index(
+        ["str_index", "int_index", "dt_index"],
+        drop=True,
     )
-    dataframe = pandas.DataFrame(df_data, index=index)
 
     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py
index 4af0760f..dc5504f9 100644
--- a/tests/unit/schema/test_pyarrow_to_bigquery.py
+++ b/tests/unit/schema/test_pyarrow_to_bigquery.py
@@ -42,16 +42,14 @@ def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type):
 
 
 def test_arrow_type_to_bigquery_field_unknown():
-    assert (
-        pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null())
-        is None
-    )
+    assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+        "test_name", pyarrow.null(), default_type="DEFAULT_TYPE"
+    ) == bigquery.SchemaField("test_name", "DEFAULT_TYPE")
 
 
 def test_arrow_type_to_bigquery_field_list_of_unknown():
-    assert (
-        pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-            "test_name", pyarrow.list_(pyarrow.null())
-        )
-        is None
-    )
+    assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+        "test_name",
+        pyarrow.list_(pyarrow.null()),
+        default_type="DEFAULT_TYPE",
+    ) == bigquery.SchemaField("test_name", "DEFAULT_TYPE", mode="REPEATED")
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 48e8862a..0da16baf 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
     [
         pytest.param(
             pandas.DataFrame(data={"col1": [object()]}),
-            {"fields": [{"name": "col1", "type": "STRING"}]},
+            {"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
             id="default-type-fails-pyarrow-conversion",
         ),
         (
@@ -182,13 +182,15 @@
                     else "object",
                 ),
                 "list_of_struct": pandas.Series(
-                    [[], [{"test": "abc"}], []],
+                    [[], [{"test": 123.0}], []],
                     dtype=pandas.ArrowDtype(
-                        pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
+                        pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
                     )
                     if hasattr(pandas, "ArrowDtype")
                     else "object",
                 ),
+                "list_of_unknown": [[], [], []],
+                "list_of_null": [[None, None], [None], [None, None]],
             }
         ),
         {
@@ -200,17 +202,56 @@
                         "type": "RECORD",
                         "mode": "REPEATED",
                         "fields": [
-                            {"name": "test", "type": "STRING", "mode": "NULLABLE"},
+                            {"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
                         ],
                     },
+                    # Use DEFAULT_TYPE because there are no values to detect a type.
+                    {
+                        "name": "list_of_unknown",
+                        "type": "DEFAULT_TYPE",
+                        "mode": "REPEATED",
+                    },
+                    {
+                        "name": "list_of_null",
+                        "type": "DEFAULT_TYPE",
+                        "mode": "REPEATED",
+                    },
                 ],
             },
             id="array",
         ),
+        pytest.param(
+            # If a struct contains only nulls in a sub-field, use the default
+            # type for subfields without a type we can determine.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/836
+            pandas.DataFrame(
+                {
+                    "id": [0, 1],
+                    "positions": [{"state": None}, {"state": None}],
+                },
+            ),
+            {
+                "fields": [
+                    {"name": "id", "type": "INTEGER"},
+                    {
+                        "name": "positions",
+                        "type": "RECORD",
+                        "fields": [
+                            {
+                                "name": "state",
+                                "type": "DEFAULT_TYPE",
+                                "mode": "NULLABLE",
+                            },
+                        ],
+                    },
+                ],
+            },
+            id="issue832-null-struct-field",
+        ),
     ],
 )
 def test_generate_bq_schema(dataframe, expected_schema):
-    schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
+    schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
 
     # NULLABLE is the default mode.
     for field in expected_schema["fields"]:
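End to end, these tests pin down the behavior from issue 836: a struct column whose sub-field contains only NULLs now falls back to the default type where previously no type could be determined. A sketch through the same private entry point the test uses (`_generate_bq_schema` is private API and subject to change):

    import pandas

    import pandas_gbq.gbq

    df = pandas.DataFrame(
        {
            "id": [0, 1],
            "positions": [{"state": None}, {"state": None}],
        }
    )
    pandas_gbq.gbq._generate_bq_schema(df, default_type="STRING")
    # -> roughly:
    # {"fields": [
    #     {"name": "id", "type": "INTEGER", "mode": "NULLABLE"},
    #     {"name": "positions", "type": "RECORD", "mode": "NULLABLE",
    #      "fields": [{"name": "state", "type": "STRING", "mode": "NULLABLE"}]},
    # ]}
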
From b32a9c98ec717573ffe45b51ce834a3903df8bc1 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Thu, 19 Dec 2024 11:48:05 -0600
Subject: [PATCH 4/4] chore(main): release 0.26.0 (#837)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md          | 12 ++++++++++++
 pandas_gbq/version.py |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bcf55cd4..41b4c8df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
 
+## [0.26.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.25.0...v0.26.0) (2024-12-19)
+
+
+### Features
+
+* `to_gbq` fails with `TypeError` if passing in a bigframes DataFrame object ([#833](https://github.com/googleapis/python-bigquery-pandas/issues/833)) ([5004d08](https://github.com/googleapis/python-bigquery-pandas/commit/5004d08c6af93471686ccb319c69cd38c7893042))
+
+
+### Bug Fixes
+
+* `to_gbq` uses `default_type` for ambiguous array types and struct field types ([#838](https://github.com/googleapis/python-bigquery-pandas/issues/838)) ([cf1aadd](https://github.com/googleapis/python-bigquery-pandas/commit/cf1aadd48165617768fecff91e68941255148dbd))
+
 ## [0.25.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.24.0...v0.25.0) (2024-12-11)
 
diff --git a/pandas_gbq/version.py b/pandas_gbq/version.py
index 478b8136..0c8dab15 100644
--- a/pandas_gbq/version.py
+++ b/pandas_gbq/version.py
@@ -2,4 +2,4 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-__version__ = "0.25.0"
+__version__ = "0.26.0"