From a3a8cb39b79a9d17b7512c55d863b240552320dc Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Mon, 25 Oct 2021 16:31:36 -0400 Subject: [PATCH 1/6] chore(python): push cloud library docs to staging bucket for Cloud RAD (#35) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(python): push cloud library docs to staging bucket for Cloud RAD Source-Link: https://github.com/googleapis/synthtool/commit/7fd61f8efae782a7cfcecc599faf52f9737fe584 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:4ee57a76a176ede9087c14330c625a71553cf9c72828b2c0ca12f5338171ba60 * update replacement in owlbot.py * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou --- .github/.OwlBot.lock.yaml | 2 +- .kokoro/docs/common.cfg | 4 +++- owlbot.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 7d98291..108063d 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -1,3 +1,3 @@ docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:58f73ba196b5414782605236dd0712a73541b44ff2ff4d3a36ec41092dd6fa5b + digest: sha256:4ee57a76a176ede9087c14330c625a71553cf9c72828b2c0ca12f5338171ba60 diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index abd7ad7..c790a9a 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -30,7 +30,9 @@ env_vars: { env_vars: { key: "V2_STAGING_BUCKET" - value: "docs-staging-v2" + # Push non-cloud library docs to `docs-staging-v2-staging` instead of the + # Cloud RAD bucket `docs-staging-v2` + value: "docs-staging-v2-staging" } # It will upload the docker 
image after successful builds. diff --git a/owlbot.py b/owlbot.py index e6c264c..eae82e9 100644 --- a/owlbot.py +++ b/owlbot.py @@ -50,7 +50,7 @@ ) s.replace( - ["noxfile.py"], "google/cloud", "db_dtypes", + ["noxfile.py"], "--cov=google", "--cov=db_dtypes", ) # There are no system tests for this package. From 9209c0ec34c208ecf24f9a04b8fbb7f0cca6730f Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Mon, 25 Oct 2021 21:28:47 -0400 Subject: [PATCH 2/6] chore(python): omit google/__init__.py in coverage (#36) Source-Link: https://github.com/googleapis/synthtool/commit/694118b039b09551fb5d445fceb361a7dbb06400 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:ec49167c606648a063d1222220b48119c912562849a0528f35bfb592a9f72737 Co-authored-by: Owl Bot --- .coveragerc | 1 + .github/.OwlBot.lock.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index ad02e9b..0f8f905 100644 --- a/.coveragerc +++ b/.coveragerc @@ -18,6 +18,7 @@ [run] branch = True omit = + google/__init__.py db_dtypes/requirements.py [report] diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 108063d..cb89b2e 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -1,3 +1,3 @@ docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:4ee57a76a176ede9087c14330c625a71553cf9c72828b2c0ca12f5338171ba60 + digest: sha256:ec49167c606648a063d1222220b48119c912562849a0528f35bfb592a9f72737 From 9fd806af3fbe5cc7a592fbc4bb19fe72dee082a5 Mon Sep 17 00:00:00 2001 From: WhiteSource Renovate Date: Tue, 26 Oct 2021 23:00:17 +0200 Subject: [PATCH 3/6] chore(deps): update dependency pyarrow to v6 (#37) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [![WhiteSource Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following 
updates: | Package | Change | Age | Adoption | Passing | Confidence | |---|---|---|---|---|---| | [pyarrow](https://arrow.apache.org/) | ` >= 3.0.0, < 6.0dev` -> `>=3.0.0, <7.0dev` | [![age](https://badges.renovateapi.com/packages/pypi/pyarrow/6.0.0/age-slim)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://badges.renovateapi.com/packages/pypi/pyarrow/6.0.0/adoption-slim)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://badges.renovateapi.com/packages/pypi/pyarrow/6.0.0/compatibility-slim/5.0.0)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://badges.renovateapi.com/packages/pypi/pyarrow/6.0.0/confidence-slim/5.0.0)](https://docs.renovatebot.com/merge-confidence/) | --- ### Configuration 📅 **Schedule**: At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever the PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, click this checkbox. --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#github/googleapis/python-db-dtypes-pandas). 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e1e355..19377e5 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ dependencies = [ "packaging >= 17.0", "pandas >= 0.24.2, < 2.0dev", - "pyarrow >= 3.0.0, < 6.0dev", + "pyarrow>=3.0.0, <7.0dev", "numpy >= 1.16.6, < 2.0dev", ] From a31d55db57b2f5655b1fee4230a930d5bee4b1c9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 4 Nov 2021 11:18:30 -0500 Subject: [PATCH 4/6] feat: support Python 3.10 (#40) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support Python 3.10 * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md Co-authored-by: Owl Bot --- CONTRIBUTING.rst | 6 ++++-- noxfile.py | 2 +- owlbot.py | 2 +- setup.py | 3 ++- testing/constraints-3.11.txt | 0 5 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 testing/constraints-3.11.txt diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index f066db3..22f6382 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.6, 3.7, 3.8 and 3.9 on both UNIX and Windows. + 3.6, 3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests. - To run a single unit test:: - $ nox -s unit-3.9 -- -k + $ nox -s unit-3.10 -- -k .. note:: @@ -225,11 +225,13 @@ We support: - `Python 3.7`_ - `Python 3.8`_ - `Python 3.9`_ +- `Python 3.10`_ .. _Python 3.6: https://docs.python.org/3.6/ .. _Python 3.7: https://docs.python.org/3.7/ .. _Python 3.8: https://docs.python.org/3.8/ .. 
_Python 3.9: https://docs.python.org/3.9/ +.. _Python 3.10: https://docs.python.org/3.10/ Supported versions can be found in our ``noxfile.py`` `config`_. diff --git a/noxfile.py b/noxfile.py index f2a2bed..5f48361 100644 --- a/noxfile.py +++ b/noxfile.py @@ -29,7 +29,7 @@ DEFAULT_PYTHON_VERSION = "3.8" SYSTEM_TEST_PYTHON_VERSIONS = ["3.8"] -UNIT_TEST_PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] +UNIT_TEST_PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() diff --git a/owlbot.py b/owlbot.py index eae82e9..30f3b3d 100644 --- a/owlbot.py +++ b/owlbot.py @@ -28,7 +28,7 @@ # Add templated files # ---------------------------------------------------------------------------- templated_files = common.py_library( - unit_test_python_versions=["3.6", "3.7", "3.8", "3.9"], + unit_test_python_versions=["3.6", "3.7", "3.8", "3.9", "3.10"], system_test_python_versions=["3.8"], cov_level=100, intersphinx_dependencies={ diff --git a/setup.py b/setup.py index 19377e5..8def678 100644 --- a/setup.py +++ b/setup.py @@ -66,11 +66,12 @@ def readme(): "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Operating System :: OS Independent", "Topic :: Database :: Front-Ends", ], platforms="Posix; MacOS X; Windows", install_requires=dependencies, - python_requires=">=3.6, <3.10", + python_requires=">=3.6, <3.11", tests_require=["pytest"], ) diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt new file mode 100644 index 0000000..e69de29 From facc7b0897e27c5ba99399b7d453818c5b4aeca7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 8 Nov 2021 16:48:53 -0600 Subject: [PATCH 5/6] feat: support conversion from pyarrow RecordBatch to pandas DataFrame (#39) * feat: support conversion from pyarrow RecordBatch to pandas DataFrame * hack together working implementation TODO: add tests for constructing 
pandas Series with pyarrow scalars * fix unit test coverage, optimize arrow to numpy conversion * apply same optimizations to to_arrow conversion * remove redundant to_numpy now that to_arrow doesn't use it * be explicit about chunked array vs array * add docstrings to arrow conversion functions * add test case for round-trip to/from pyarrow nanosecond-precision time scalars * add time32("ms") test case without nulls for completeness --- db_dtypes/__init__.py | 109 +++++++++-- db_dtypes/core.py | 8 +- tests/unit/test_arrow.py | 404 +++++++++++++++++++++++++++------------ 3 files changed, 375 insertions(+), 146 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index bce2bf0..f1424fb 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -17,6 +17,7 @@ import datetime import re +from typing import Union import numpy import packaging.version @@ -29,6 +30,7 @@ import pandas.core.dtypes.generic import pandas.core.nanops import pyarrow +import pyarrow.compute from db_dtypes.version import __version__ from db_dtypes import core @@ -36,6 +38,8 @@ date_dtype_name = "dbdate" time_dtype_name = "dbtime" +_EPOCH = datetime.datetime(1970, 1, 1) +_NPEPOCH = numpy.datetime64(_EPOCH) pandas_release = packaging.version.parse(pandas.__version__).release @@ -52,6 +56,33 @@ class TimeDtype(core.BaseDatetimeDtype): def construct_array_type(self): return TimeArray + @staticmethod + def __from_arrow__( + array: Union[pyarrow.Array, pyarrow.ChunkedArray] + ) -> "TimeArray": + """Convert to dbtime data from an Arrow array. + + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ + # We can't call combine_chunks on an empty array, so short-circuit the + # rest of the function logic for this special case. 
+ if len(array) == 0: + return TimeArray(numpy.array([], dtype="datetime64[ns]")) + + # We can't cast to timestamp("ns"), but time64("ns") has the same + # memory layout: 64-bit integers representing the number of nanoseconds + # since the datetime epoch (midnight 1970-01-01). + array = pyarrow.compute.cast(array, pyarrow.time64("ns")) + + # ChunkedArray has no "view" method, so combine into an Array. + if isinstance(array, pyarrow.ChunkedArray): + array = array.combine_chunks() + + array = array.view(pyarrow.timestamp("ns")) + np_array = array.to_numpy(zero_copy_only=False) + return TimeArray(np_array) + class TimeArray(core.BaseDatetimeArray): """ @@ -61,8 +92,6 @@ class TimeArray(core.BaseDatetimeArray): # Data are stored as datetime64 values with a date of Jan 1, 1970 dtype = TimeDtype() - _epoch = datetime.datetime(1970, 1, 1) - _npepoch = numpy.datetime64(_epoch) @classmethod def _datetime( @@ -75,8 +104,21 @@ def _datetime( r"(?:\.(?P\d*))?)?)?\s*$" ).match, ): - if isinstance(scalar, datetime.time): - return datetime.datetime.combine(cls._epoch, scalar) + # Convert pyarrow values to datetime.time. + if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)): + scalar = ( + scalar.cast(pyarrow.time64("ns")) + .cast(pyarrow.int64()) + .cast(pyarrow.timestamp("ns")) + .as_py() + ) + + if scalar is None: + return None + elif isinstance(scalar, datetime.time): + return datetime.datetime.combine(_EPOCH, scalar) + elif isinstance(scalar, pandas.Timestamp): + return scalar.to_datetime64() elif isinstance(scalar, str): # iso string parsed = match_fn(scalar) @@ -113,7 +155,7 @@ def _box_func(self, x): __return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", " "DateArray": + """Convert to dbdate data from an Arrow array. 
+ + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ + array = pyarrow.compute.cast(array, pyarrow.timestamp("ns")) + np_array = array.to_numpy() + return DateArray(np_array) + class DateArray(core.BaseDatetimeArray): """ @@ -161,7 +226,13 @@ def _datetime( scalar, match_fn=re.compile(r"\s*(?P\d+)-(?P\d+)-(?P\d+)\s*$").match, ): - if isinstance(scalar, datetime.date): + # Convert pyarrow values to datetime.date. + if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)): + scalar = scalar.as_py() + + if scalar is None: + return None + elif isinstance(scalar, datetime.date): return datetime.datetime(scalar.year, scalar.month, scalar.day) elif isinstance(scalar, str): match = match_fn(scalar) @@ -197,8 +268,14 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy=copy) def __arrow_array__(self, type=None): - return pyarrow.array( - self._ndarray, type=type if type is not None else pyarrow.date32(), + """Convert to an Arrow array from dbdate data. 
+ + See: + https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + """ + array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns")) + return pyarrow.compute.cast( + array, type if type is not None else pyarrow.date32(), ) def __add__(self, other): @@ -206,7 +283,7 @@ def __add__(self, other): return self.astype("object") + other if isinstance(other, TimeArray): - return (other._ndarray - other._npepoch) + self._ndarray + return (other._ndarray - _NPEPOCH) + self._ndarray return super().__add__(other) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index fbc784e..c8f3ad4 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -17,6 +17,7 @@ import numpy import pandas from pandas._libs import NaT +import pandas.api.extensions import pandas.compat.numpy.function import pandas.core.algorithms import pandas.core.arrays @@ -32,7 +33,7 @@ pandas_release = pandas_backports.pandas_release -class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype): +class BaseDatetimeDtype(pandas.api.extensions.ExtensionDtype): na_value = NaT kind = "o" names = None @@ -60,10 +61,7 @@ def __init__(self, values, dtype=None, copy: bool = False): @classmethod def __ndarray(cls, scalars): - return numpy.array( - [None if scalar is None else cls._datetime(scalar) for scalar in scalars], - "M8[ns]", - ) + return numpy.array([cls._datetime(scalar) for scalar in scalars], "M8[ns]",) @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py index d3745ea..5f45a90 100644 --- a/tests/unit/test_arrow.py +++ b/tests/unit/test_arrow.py @@ -13,160 +13,314 @@ # limitations under the License. import datetime as dt +from typing import Optional import pandas +import pandas.api.extensions +import pandas.testing import pyarrow import pytest -# To register the types. 
-import db_dtypes # noqa +import db_dtypes -@pytest.mark.parametrize( - ("series", "expected"), +SECOND_NANOS = 1_000_000_000 +MINUTE_NANOS = 60 * SECOND_NANOS +HOUR_NANOS = 60 * MINUTE_NANOS + + +def types_mapper( + pyarrow_type: pyarrow.DataType, +) -> Optional[pandas.api.extensions.ExtensionDtype]: + type_str = str(pyarrow_type) + + if type_str.startswith("date32") or type_str.startswith("date64"): + return db_dtypes.DateDtype + elif type_str.startswith("time32") or type_str.startswith("time64"): + return db_dtypes.TimeDtype + else: + # Use default type mapping. + return None + + +SERIES_ARRAYS_DEFAULT_TYPES = [ + (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())), ( - (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())), - ( - pandas.Series([None, None, None], dtype="dbdate"), - pyarrow.array([None, None, None], type=pyarrow.date32()), - ), - ( - pandas.Series( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" - ), - pyarrow.array( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], - type=pyarrow.date32(), - ), + pandas.Series([None, None, None], dtype="dbdate"), + pyarrow.array([None, None, None], type=pyarrow.date32()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" ), - ( - pandas.Series( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - dtype="dbdate", - ), - pyarrow.array( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - type=pyarrow.date32(), - ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date32(), ), - ( - pandas.Series([], dtype="dbtime"), - pyarrow.array([], type=pyarrow.time64("ns")), + ), + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="dbdate", ), - ( - pandas.Series([None, None, None], dtype="dbtime"), - pyarrow.array([None, None, None], type=pyarrow.time64("ns")), + pyarrow.array( + 
[dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date32(), ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - dtype="dbtime", - ), - pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - type=pyarrow.time64("ns"), - ), + ), + (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time64("ns")),), + ( + pandas.Series([None, None, None], dtype="dbtime"), + pyarrow.array([None, None, None], type=pyarrow.time64("ns")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime", ), - ( - pandas.Series( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - dtype="dbtime", - ), - pyarrow.array( - [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), - dt.time(23, 59, 59, 999_999), - ], - type=pyarrow.time64("ns"), - ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("ns"), ), ), -) + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("ns"), + ), + ), +] +SERIES_ARRAYS_CUSTOM_ARROW_TYPES = [ + (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())), + ( + pandas.Series([None, None, None], dtype="dbdate"), + pyarrow.array([None, None, None], type=pyarrow.date64()), + ), + ( + pandas.Series( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" + ), + pyarrow.array( + [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date64(), + ), + ), + ( + pandas.Series( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + dtype="dbdate", + ), + pyarrow.array( + [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], + type=pyarrow.date64(), + ), + 
), + (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time32("ms")),), + ( + pandas.Series([None, None, None], dtype="dbtime"), + pyarrow.array([None, None, None], type=pyarrow.time32("ms")), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], dtype="dbtime", + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], + type=pyarrow.time32("ms"), + ), + ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_000), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_000), + ], + type=pyarrow.time32("ms"), + ), + ), + ( + pandas.Series( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime", + ), + pyarrow.array( + [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], + type=pyarrow.time64("us"), + ), + ), + ( + pandas.Series( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + dt.time(0, 0, 0, 0), + dt.time(12, 30, 15, 125_000), + dt.time(23, 59, 59, 999_999), + ], + type=pyarrow.time64("us"), + ), + ), + ( + pandas.Series( + [ + # Only microseconds are supported when reading data. See: + # https://github.com/googleapis/python-db-dtypes-pandas/issues/19 + # Still, round-trip with pyarrow nanosecond precision scalars + # is supported. 
+ pyarrow.scalar(0, pyarrow.time64("ns")), + pyarrow.scalar( + 12 * HOUR_NANOS + + 30 * MINUTE_NANOS + + 15 * SECOND_NANOS + + 123_456_789, + pyarrow.time64("ns"), + ), + pyarrow.scalar( + 23 * HOUR_NANOS + + 59 * MINUTE_NANOS + + 59 * SECOND_NANOS + + 999_999_999, + pyarrow.time64("ns"), + ), + ], + dtype="dbtime", + ), + pyarrow.array( + [ + 0, + 12 * HOUR_NANOS + 30 * MINUTE_NANOS + 15 * SECOND_NANOS + 123_456_789, + 23 * HOUR_NANOS + 59 * MINUTE_NANOS + 59 * SECOND_NANOS + 999_999_999, + ], + type=pyarrow.time64("ns"), + ), + ), +] + + +@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_DEFAULT_TYPES) def test_to_arrow(series, expected): array = pyarrow.array(series) assert array.equals(expected) +@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_CUSTOM_ARROW_TYPES) +def test_to_arrow_w_arrow_type(series, expected): + array = pyarrow.array(series, type=expected.type) + assert array.equals(expected) + + @pytest.mark.parametrize( - ("series", "expected"), - ( - (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())), - ( - pandas.Series([None, None, None], dtype="dbdate"), - pyarrow.array([None, None, None], type=pyarrow.date64()), - ), - ( - pandas.Series( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate" - ), - pyarrow.array( - [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], - type=pyarrow.date64(), - ), - ), - ( - pandas.Series( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - dtype="dbdate", - ), - pyarrow.array( - [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)], - type=pyarrow.date64(), - ), - ), - ( - pandas.Series([], dtype="dbtime"), - pyarrow.array([], type=pyarrow.time32("ms")), - ), - ( - pandas.Series([None, None, None], dtype="dbtime"), - pyarrow.array([None, None, None], type=pyarrow.time32("ms")), - ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], - dtype="dbtime", - ), + ["expected", "pyarrow_array"], + 
SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES, +) +def test_series_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series): + # Convert to RecordBatch because types_mapper argument is ignored when + # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664 + record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"]) + dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + series = dataframe["test_col"] + pandas.testing.assert_series_equal(series, expected, check_names=False) + + +@pytest.mark.parametrize( + ["expected", "pyarrow_array"], + SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES, +) +def test_series_from_arrow_scalars( + pyarrow_array: pyarrow.Array, expected: pandas.Series +): + scalars = [] + for scalar in pyarrow_array: + scalars.append(scalar) + assert isinstance(scalar, pyarrow.Scalar) + series = pandas.Series(scalars, dtype=expected.dtype) + pandas.testing.assert_series_equal(series, expected) + + +def test_dbtime_series_from_arrow_array(): + """Test to explicitly check Array -> Series conversion.""" + array = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us")) + assert isinstance(array, pyarrow.Array) + assert not isinstance(array, pyarrow.ChunkedArray) + series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array)) + expected = pandas.Series([dt.time(15, 21, 0, 123_456)], dtype="dbtime") + pandas.testing.assert_series_equal(series, expected) + + +def test_dbtime_series_from_arrow_chunkedarray(): + """Test to explicitly check ChunkedArray -> Series conversion.""" + array1 = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us")) + array2 = pyarrow.array([dt.time(0, 0, 0, 0)], type=pyarrow.time64("us")) + array = pyarrow.chunked_array([array1, array2]) + assert isinstance(array, pyarrow.ChunkedArray) + series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array)) + expected = pandas.Series( + 
[dt.time(15, 21, 0, 123_456), dt.time(0, 0, 0, 0)], dtype="dbtime" + ) + pandas.testing.assert_series_equal(series, expected) + + +def test_dataframe_from_arrow(): + record_batch = pyarrow.RecordBatch.from_arrays( + [ pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], - type=pyarrow.time32("ms"), - ), - ), - ( - pandas.Series( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - dtype="dbtime", + [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)], + type=pyarrow.date32(), ), pyarrow.array( - [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], - type=pyarrow.time64("us"), - ), - ), - ( - pandas.Series( [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), + dt.time(10, 7, 8, 995_325), dt.time(23, 59, 59, 999_999), + None, + dt.time(0, 0, 0, 0), ], - dtype="dbtime", + type=pyarrow.time64("us"), ), - pyarrow.array( + pyarrow.array([1, 2, 3, 4]), + ], + ["date_col", "time_col", "int_col"], + ) + dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper) + expected = pandas.DataFrame( + { + "date_col": pandas.Series( + [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)], + dtype="dbdate", + ), + "time_col": pandas.Series( [ - dt.time(0, 0, 0, 0), - dt.time(12, 30, 15, 125_000), + dt.time(10, 7, 8, 995_325), dt.time(23, 59, 59, 999_999), + None, + dt.time(0, 0, 0, 0), ], - type=pyarrow.time64("us"), + dtype="dbtime", ), - ), - ), -) -def test_to_arrow_w_arrow_type(series, expected): - array = pyarrow.array(series, type=expected.type) - assert array.equals(expected) + "int_col": [1, 2, 3, 4], + }, + columns=["date_col", "time_col", "int_col"], + ) + pandas.testing.assert_frame_equal(dataframe, expected) From 8f248cbf87ebbc2c1bf3d3aaa0e1802f5406aa04 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 8 Nov 2021 23:00:16 +0000 Subject: [PATCH 6/6] chore: release 0.3.0 (#41) :robot: I have created a 
release \*beep\* \*boop\* --- ## [0.3.0](https://www.github.com/googleapis/python-db-dtypes-pandas/compare/v0.2.0...v0.3.0) (2021-11-08) ### Features * support conversion from pyarrow RecordBatch to pandas DataFrame ([#39](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/39)) ([facc7b0](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/facc7b0897e27c5ba99399b7d453818c5b4aeca7)) * support Python 3.10 ([#40](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/40)) ([a31d55d](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/a31d55db57b2f5655b1fee4230a930d5bee4b1c9)) --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- CHANGELOG.md | 8 ++++++++ db_dtypes/version.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b78ab3..3f956b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.3.0](https://www.github.com/googleapis/python-db-dtypes-pandas/compare/v0.2.0...v0.3.0) (2021-11-08) + + +### Features + +* support conversion from pyarrow RecordBatch to pandas DataFrame ([#39](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/39)) ([facc7b0](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/facc7b0897e27c5ba99399b7d453818c5b4aeca7)) +* support Python 3.10 ([#40](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/40)) ([a31d55d](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/a31d55db57b2f5655b1fee4230a930d5bee4b1c9)) + ## [0.2.0](https://www.github.com/googleapis/python-db-dtypes-pandas/compare/v0.1.1...v0.2.0) (2021-10-14) diff --git a/db_dtypes/version.py b/db_dtypes/version.py index 4da46cc..005815d 100644 --- a/db_dtypes/version.py +++ b/db_dtypes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and 
# limitations under the License. -__version__ = "0.2.0" +__version__ = "0.3.0"