diff --git a/.coveragerc b/.coveragerc
index ad02e9b..0f8f905 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -18,6 +18,7 @@
[run]
branch = True
omit =
+ google/__init__.py
db_dtypes/requirements.py
[report]
diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 7d98291..cb89b2e 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -1,3 +1,3 @@
docker:
image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
- digest: sha256:58f73ba196b5414782605236dd0712a73541b44ff2ff4d3a36ec41092dd6fa5b
+ digest: sha256:ec49167c606648a063d1222220b48119c912562849a0528f35bfb592a9f72737
diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg
index abd7ad7..c790a9a 100644
--- a/.kokoro/docs/common.cfg
+++ b/.kokoro/docs/common.cfg
@@ -30,7 +30,9 @@ env_vars: {
env_vars: {
key: "V2_STAGING_BUCKET"
- value: "docs-staging-v2"
+ # Push non-cloud library docs to `docs-staging-v2-staging` instead of the
+ # Cloud RAD bucket `docs-staging-v2`
+ value: "docs-staging-v2-staging"
}
# It will upload the docker image after successful builds.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b78ab3..3f956b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
# Changelog
+## [0.3.0](https://www.github.com/googleapis/python-db-dtypes-pandas/compare/v0.2.0...v0.3.0) (2021-11-08)
+
+
+### Features
+
+* support conversion from pyarrow RecordBatch to pandas DataFrame ([#39](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/39)) ([facc7b0](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/facc7b0897e27c5ba99399b7d453818c5b4aeca7))
+* support Python 3.10 ([#40](https://www.github.com/googleapis/python-db-dtypes-pandas/issues/40)) ([a31d55d](https://www.github.com/googleapis/python-db-dtypes-pandas/commit/a31d55db57b2f5655b1fee4230a930d5bee4b1c9))
+
## [0.2.0](https://www.github.com/googleapis/python-db-dtypes-pandas/compare/v0.1.1...v0.2.0) (2021-10-14)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index f066db3..22f6382 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
documentation.
- The feature must work fully on the following CPython versions:
- 3.6, 3.7, 3.8 and 3.9 on both UNIX and Windows.
+ 3.6, 3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows.
- The feature must not add unnecessary dependencies (where
"unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox <https://nox.thea.codes/en/latest/>`__ to instrument our tests.
- To run a single unit test::
- $ nox -s unit-3.9 -- -k <name of test>
+ $ nox -s unit-3.10 -- -k <name of test>
.. note::
@@ -225,11 +225,13 @@ We support:
- `Python 3.7`_
- `Python 3.8`_
- `Python 3.9`_
+- `Python 3.10`_
.. _Python 3.6: https://docs.python.org/3.6/
.. _Python 3.7: https://docs.python.org/3.7/
.. _Python 3.8: https://docs.python.org/3.8/
.. _Python 3.9: https://docs.python.org/3.9/
+.. _Python 3.10: https://docs.python.org/3.10/
Supported versions can be found in our ``noxfile.py`` `config`_.
diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py
index bce2bf0..f1424fb 100644
--- a/db_dtypes/__init__.py
+++ b/db_dtypes/__init__.py
@@ -17,6 +17,7 @@
import datetime
import re
+from typing import Union
import numpy
import packaging.version
@@ -29,6 +30,7 @@
import pandas.core.dtypes.generic
import pandas.core.nanops
import pyarrow
+import pyarrow.compute
from db_dtypes.version import __version__
from db_dtypes import core
@@ -36,6 +38,8 @@
date_dtype_name = "dbdate"
time_dtype_name = "dbtime"
+_EPOCH = datetime.datetime(1970, 1, 1)
+_NPEPOCH = numpy.datetime64(_EPOCH)
pandas_release = packaging.version.parse(pandas.__version__).release
@@ -52,6 +56,33 @@ class TimeDtype(core.BaseDatetimeDtype):
def construct_array_type(self):
return TimeArray
+ @staticmethod
+ def __from_arrow__(
+ array: Union[pyarrow.Array, pyarrow.ChunkedArray]
+ ) -> "TimeArray":
+ """Convert to dbtime data from an Arrow array.
+
+ See:
+ https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
+ """
+ # We can't call combine_chunks on an empty array, so short-circuit the
+ # rest of the function logic for this special case.
+ if len(array) == 0:
+ return TimeArray(numpy.array([], dtype="datetime64[ns]"))
+
+ # We can't cast to timestamp("ns"), but time64("ns") has the same
+ # memory layout: 64-bit integers representing the number of nanoseconds
+ # since the datetime epoch (midnight 1970-01-01).
+ array = pyarrow.compute.cast(array, pyarrow.time64("ns"))
+
+ # ChunkedArray has no "view" method, so combine into an Array.
+ if isinstance(array, pyarrow.ChunkedArray):
+ array = array.combine_chunks()
+
+ array = array.view(pyarrow.timestamp("ns"))
+ np_array = array.to_numpy(zero_copy_only=False)
+ return TimeArray(np_array)
+
class TimeArray(core.BaseDatetimeArray):
"""
@@ -61,8 +92,6 @@ class TimeArray(core.BaseDatetimeArray):
# Data are stored as datetime64 values with a date of Jan 1, 1970
dtype = TimeDtype()
- _epoch = datetime.datetime(1970, 1, 1)
- _npepoch = numpy.datetime64(_epoch)
@classmethod
def _datetime(
@@ -75,8 +104,21 @@ def _datetime(
r"(?:\.(?P\d*))?)?)?\s*$"
).match,
):
- if isinstance(scalar, datetime.time):
- return datetime.datetime.combine(cls._epoch, scalar)
+ # Convert pyarrow values to datetime.time.
+ if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)):
+ scalar = (
+ scalar.cast(pyarrow.time64("ns"))
+ .cast(pyarrow.int64())
+ .cast(pyarrow.timestamp("ns"))
+ .as_py()
+ )
+
+ if scalar is None:
+ return None
+ elif isinstance(scalar, datetime.time):
+ return datetime.datetime.combine(_EPOCH, scalar)
+ elif isinstance(scalar, pandas.Timestamp):
+ return scalar.to_datetime64()
elif isinstance(scalar, str):
# iso string
parsed = match_fn(scalar)
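
The scalar branch above chains three casts to move a time-of-day value onto the epoch date. A minimal sketch of that chain in isolation (assuming only that `pyarrow.Scalar.cast` supports these casts, as the code above already requires):

```python
import pyarrow

# 12:30:15 expressed as nanoseconds since midnight.
nanos = (12 * 3600 + 30 * 60 + 15) * 1_000_000_000

# time64 -> int64 -> timestamp("ns") reinterprets nanoseconds since midnight
# as nanoseconds since the epoch, i.e. a datetime on 1970-01-01.
scalar = pyarrow.scalar(nanos, pyarrow.time64("ns"))
as_timestamp = (
    scalar.cast(pyarrow.time64("ns"))
    .cast(pyarrow.int64())
    .cast(pyarrow.timestamp("ns"))
)
print(as_timestamp.as_py())  # 1970-01-01 12:30:15
```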
@@ -113,7 +155,7 @@ def _box_func(self, x):
__return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", " "DateArray":
+ """Convert to dbdate data from an Arrow array.
+
+ See:
+ https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
+ """
+ array = pyarrow.compute.cast(array, pyarrow.timestamp("ns"))
+ np_array = array.to_numpy()
+ return DateArray(np_array)
+
class DateArray(core.BaseDatetimeArray):
"""
@@ -161,7 +226,13 @@ def _datetime(
scalar,
match_fn=re.compile(r"\s*(?P\d+)-(?P\d+)-(?P\d+)\s*$").match,
):
- if isinstance(scalar, datetime.date):
+ # Convert pyarrow values to datetime.date.
+ if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)):
+ scalar = scalar.as_py()
+
+ if scalar is None:
+ return None
+ elif isinstance(scalar, datetime.date):
return datetime.datetime(scalar.year, scalar.month, scalar.day)
elif isinstance(scalar, str):
match = match_fn(scalar)
@@ -197,8 +268,14 @@ def astype(self, dtype, copy=True):
return super().astype(dtype, copy=copy)
def __arrow_array__(self, type=None):
- return pyarrow.array(
- self._ndarray, type=type if type is not None else pyarrow.date32(),
+ """Convert to an Arrow array from dbdate data.
+
+ See:
+ https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
+ """
+ array = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns"))
+ return pyarrow.compute.cast(
+ array, type if type is not None else pyarrow.date32(),
)
def __add__(self, other):
@@ -206,7 +283,7 @@ def __add__(self, other):
return self.astype("object") + other
if isinstance(other, TimeArray):
- return (other._ndarray - other._npepoch) + self._ndarray
+ return (other._ndarray - _NPEPOCH) + self._ndarray
return super().__add__(other)
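
Both conversion paths lean on the fact that `time64("ns")`, `int64`, and `timestamp("ns")` share a 64-bit integer representation, and that dates round-trip through `timestamp("ns")`. A short sketch of the underlying pyarrow behavior (pyarrow >= 3.0.0, matching the pin in setup.py below):

```python
import datetime

import pyarrow
import pyarrow.compute

# A zero-copy view turns time-of-day values into timestamps on 1970-01-01,
# which matches TimeArray's internal "datetime64 anchored at the epoch".
times = pyarrow.array(
    [datetime.time(12, 30, 15, 125_000), None], type=pyarrow.time64("ns")
)
print(times.view(pyarrow.timestamp("ns")).to_pylist())
# [datetime.datetime(1970, 1, 1, 12, 30, 15, 125000), None]

# Dates survive the round trip through timestamp("ns"): the timestamps all
# fall on midnight, so casting back to date32 loses nothing.
as_timestamps = pyarrow.compute.cast(
    pyarrow.array([datetime.date(2021, 11, 8)], type=pyarrow.date32()),
    pyarrow.timestamp("ns"),
)
print(pyarrow.compute.cast(as_timestamps, pyarrow.date32()).to_pylist())
# [datetime.date(2021, 11, 8)]
```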
diff --git a/db_dtypes/core.py b/db_dtypes/core.py
index fbc784e..c8f3ad4 100644
--- a/db_dtypes/core.py
+++ b/db_dtypes/core.py
@@ -17,6 +17,7 @@
import numpy
import pandas
from pandas._libs import NaT
+import pandas.api.extensions
import pandas.compat.numpy.function
import pandas.core.algorithms
import pandas.core.arrays
@@ -32,7 +33,7 @@
pandas_release = pandas_backports.pandas_release
-class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype):
+class BaseDatetimeDtype(pandas.api.extensions.ExtensionDtype):
na_value = NaT
kind = "o"
names = None
@@ -60,10 +61,7 @@ def __init__(self, values, dtype=None, copy: bool = False):
@classmethod
def __ndarray(cls, scalars):
- return numpy.array(
- [None if scalar is None else cls._datetime(scalar) for scalar in scalars],
- "M8[ns]",
- )
+ return numpy.array([cls._datetime(scalar) for scalar in scalars], "M8[ns]",)
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
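
The simplified `__ndarray` works because `_datetime` now returns `None` for missing values and numpy maps `None` to `NaT` when building a datetime64 array. A quick illustration (plain numpy, no db_dtypes assumptions):

```python
import datetime

import numpy

# None becomes NaT during "M8[ns]" array construction, so a per-scalar
# None check in __ndarray is redundant once _datetime passes None through.
values = numpy.array(
    [None, datetime.datetime(1970, 1, 1, 12, 30, 15)], dtype="M8[ns]"
)
print(values)  # ['NaT' '1970-01-01T12:30:15.000000000']
```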
diff --git a/db_dtypes/version.py b/db_dtypes/version.py
index 4da46cc..005815d 100644
--- a/db_dtypes/version.py
+++ b/db_dtypes/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "0.2.0"
+__version__ = "0.3.0"
diff --git a/noxfile.py b/noxfile.py
index f2a2bed..5f48361 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -29,7 +29,7 @@
DEFAULT_PYTHON_VERSION = "3.8"
SYSTEM_TEST_PYTHON_VERSIONS = ["3.8"]
-UNIT_TEST_PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()
diff --git a/owlbot.py b/owlbot.py
index e6c264c..30f3b3d 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -28,7 +28,7 @@
# Add templated files
# ----------------------------------------------------------------------------
templated_files = common.py_library(
- unit_test_python_versions=["3.6", "3.7", "3.8", "3.9"],
+ unit_test_python_versions=["3.6", "3.7", "3.8", "3.9", "3.10"],
system_test_python_versions=["3.8"],
cov_level=100,
intersphinx_dependencies={
@@ -50,7 +50,7 @@
)
s.replace(
- ["noxfile.py"], "google/cloud", "db_dtypes",
+ ["noxfile.py"], "--cov=google", "--cov=db_dtypes",
)
# There are no system tests for this package.
diff --git a/setup.py b/setup.py
index 8e1e355..8def678 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
dependencies = [
"packaging >= 17.0",
"pandas >= 0.24.2, < 2.0dev",
- "pyarrow >= 3.0.0, < 6.0dev",
+ "pyarrow>=3.0.0, <7.0dev",
"numpy >= 1.16.6, < 2.0dev",
]
@@ -66,11 +66,12 @@ def readme():
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
"Operating System :: OS Independent",
"Topic :: Database :: Front-Ends",
],
platforms="Posix; MacOS X; Windows",
install_requires=dependencies,
- python_requires=">=3.6, <3.10",
+ python_requires=">=3.6, <3.11",
tests_require=["pytest"],
)
diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_arrow.py b/tests/unit/test_arrow.py
index d3745ea..5f45a90 100644
--- a/tests/unit/test_arrow.py
+++ b/tests/unit/test_arrow.py
@@ -13,160 +13,314 @@
# limitations under the License.
import datetime as dt
+from typing import Optional
import pandas
+import pandas.api.extensions
+import pandas.testing
import pyarrow
import pytest
-# To register the types.
-import db_dtypes # noqa
+import db_dtypes
-@pytest.mark.parametrize(
- ("series", "expected"),
+SECOND_NANOS = 1_000_000_000
+MINUTE_NANOS = 60 * SECOND_NANOS
+HOUR_NANOS = 60 * MINUTE_NANOS
+
+
+def types_mapper(
+ pyarrow_type: pyarrow.DataType,
+) -> Optional[pandas.api.extensions.ExtensionDtype]:
+ type_str = str(pyarrow_type)
+
+ if type_str.startswith("date32") or type_str.startswith("date64"):
+ return db_dtypes.DateDtype
+ elif type_str.startswith("time32") or type_str.startswith("time64"):
+ return db_dtypes.TimeDtype
+ else:
+ # Use default type mapping.
+ return None
+
+
+SERIES_ARRAYS_DEFAULT_TYPES = [
+ (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())),
(
- (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())),
- (
- pandas.Series([None, None, None], dtype="dbdate"),
- pyarrow.array([None, None, None], type=pyarrow.date32()),
- ),
- (
- pandas.Series(
- [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
- ),
- pyarrow.array(
- [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)],
- type=pyarrow.date32(),
- ),
+ pandas.Series([None, None, None], dtype="dbdate"),
+ pyarrow.array([None, None, None], type=pyarrow.date32()),
+ ),
+ (
+ pandas.Series(
+ [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
),
- (
- pandas.Series(
- [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
- dtype="dbdate",
- ),
- pyarrow.array(
- [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
- type=pyarrow.date32(),
- ),
+ pyarrow.array(
+ [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date32(),
),
- (
- pandas.Series([], dtype="dbtime"),
- pyarrow.array([], type=pyarrow.time64("ns")),
+ ),
+ (
+ pandas.Series(
+ [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
+ dtype="dbdate",
),
- (
- pandas.Series([None, None, None], dtype="dbtime"),
- pyarrow.array([None, None, None], type=pyarrow.time64("ns")),
+ pyarrow.array(
+ [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
+ type=pyarrow.date32(),
),
- (
- pandas.Series(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
- dtype="dbtime",
- ),
- pyarrow.array(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
- type=pyarrow.time64("ns"),
- ),
+ ),
+ (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time64("ns")),),
+ (
+ pandas.Series([None, None, None], dtype="dbtime"),
+ pyarrow.array([None, None, None], type=pyarrow.time64("ns")),
+ ),
+ (
+ pandas.Series(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime",
),
- (
- pandas.Series(
- [
- dt.time(0, 0, 0, 0),
- dt.time(12, 30, 15, 125_000),
- dt.time(23, 59, 59, 999_999),
- ],
- dtype="dbtime",
- ),
- pyarrow.array(
- [
- dt.time(0, 0, 0, 0),
- dt.time(12, 30, 15, 125_000),
- dt.time(23, 59, 59, 999_999),
- ],
- type=pyarrow.time64("ns"),
- ),
+ pyarrow.array(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
+ type=pyarrow.time64("ns"),
),
),
-)
+ (
+ pandas.Series(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_999),
+ ],
+ dtype="dbtime",
+ ),
+ pyarrow.array(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_999),
+ ],
+ type=pyarrow.time64("ns"),
+ ),
+ ),
+]
+SERIES_ARRAYS_CUSTOM_ARROW_TYPES = [
+ (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())),
+ (
+ pandas.Series([None, None, None], dtype="dbdate"),
+ pyarrow.array([None, None, None], type=pyarrow.date64()),
+ ),
+ (
+ pandas.Series(
+ [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
+ ),
+ pyarrow.array(
+ [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], type=pyarrow.date64(),
+ ),
+ ),
+ (
+ pandas.Series(
+ [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
+ dtype="dbdate",
+ ),
+ pyarrow.array(
+ [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
+ type=pyarrow.date64(),
+ ),
+ ),
+ (pandas.Series([], dtype="dbtime"), pyarrow.array([], type=pyarrow.time32("ms")),),
+ (
+ pandas.Series([None, None, None], dtype="dbtime"),
+ pyarrow.array([None, None, None], type=pyarrow.time32("ms")),
+ ),
+ (
+ pandas.Series(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)], dtype="dbtime",
+ ),
+ pyarrow.array(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)],
+ type=pyarrow.time32("ms"),
+ ),
+ ),
+ (
+ pandas.Series(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_000),
+ ],
+ dtype="dbtime",
+ ),
+ pyarrow.array(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_000),
+ ],
+ type=pyarrow.time32("ms"),
+ ),
+ ),
+ (
+ pandas.Series(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)], dtype="dbtime",
+ ),
+ pyarrow.array(
+ [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
+ type=pyarrow.time64("us"),
+ ),
+ ),
+ (
+ pandas.Series(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_999),
+ ],
+ dtype="dbtime",
+ ),
+ pyarrow.array(
+ [
+ dt.time(0, 0, 0, 0),
+ dt.time(12, 30, 15, 125_000),
+ dt.time(23, 59, 59, 999_999),
+ ],
+ type=pyarrow.time64("us"),
+ ),
+ ),
+ (
+ pandas.Series(
+ [
+ # Only microseconds are supported when reading data. See:
+ # https://github.com/googleapis/python-db-dtypes-pandas/issues/19
+ # Still, round-trip with pyarrow nanosecond precision scalars
+ # is supported.
+ pyarrow.scalar(0, pyarrow.time64("ns")),
+ pyarrow.scalar(
+ 12 * HOUR_NANOS
+ + 30 * MINUTE_NANOS
+ + 15 * SECOND_NANOS
+ + 123_456_789,
+ pyarrow.time64("ns"),
+ ),
+ pyarrow.scalar(
+ 23 * HOUR_NANOS
+ + 59 * MINUTE_NANOS
+ + 59 * SECOND_NANOS
+ + 999_999_999,
+ pyarrow.time64("ns"),
+ ),
+ ],
+ dtype="dbtime",
+ ),
+ pyarrow.array(
+ [
+ 0,
+ 12 * HOUR_NANOS + 30 * MINUTE_NANOS + 15 * SECOND_NANOS + 123_456_789,
+ 23 * HOUR_NANOS + 59 * MINUTE_NANOS + 59 * SECOND_NANOS + 999_999_999,
+ ],
+ type=pyarrow.time64("ns"),
+ ),
+ ),
+]
+
+
+@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_DEFAULT_TYPES)
def test_to_arrow(series, expected):
array = pyarrow.array(series)
assert array.equals(expected)
+@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_CUSTOM_ARROW_TYPES)
+def test_to_arrow_w_arrow_type(series, expected):
+ array = pyarrow.array(series, type=expected.type)
+ assert array.equals(expected)
+
+
@pytest.mark.parametrize(
- ("series", "expected"),
- (
- (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())),
- (
- pandas.Series([None, None, None], dtype="dbdate"),
- pyarrow.array([None, None, None], type=pyarrow.date64()),
- ),
- (
- pandas.Series(
- [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
- ),
- pyarrow.array(
- [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)],
- type=pyarrow.date64(),
- ),
- ),
- (
- pandas.Series(
- [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
- dtype="dbdate",
- ),
- pyarrow.array(
- [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
- type=pyarrow.date64(),
- ),
- ),
- (
- pandas.Series([], dtype="dbtime"),
- pyarrow.array([], type=pyarrow.time32("ms")),
- ),
- (
- pandas.Series([None, None, None], dtype="dbtime"),
- pyarrow.array([None, None, None], type=pyarrow.time32("ms")),
- ),
- (
- pandas.Series(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)],
- dtype="dbtime",
- ),
+ ["expected", "pyarrow_array"],
+ SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES,
+)
+def test_series_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series):
+ # Convert to RecordBatch because types_mapper argument is ignored when
+ # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664
+ record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"])
+ dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper)
+ series = dataframe["test_col"]
+ pandas.testing.assert_series_equal(series, expected, check_names=False)
+
+
+@pytest.mark.parametrize(
+ ["expected", "pyarrow_array"],
+ SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES,
+)
+def test_series_from_arrow_scalars(
+ pyarrow_array: pyarrow.Array, expected: pandas.Series
+):
+ scalars = []
+ for scalar in pyarrow_array:
+ scalars.append(scalar)
+ assert isinstance(scalar, pyarrow.Scalar)
+ series = pandas.Series(scalars, dtype=expected.dtype)
+ pandas.testing.assert_series_equal(series, expected)
+
+
+def test_dbtime_series_from_arrow_array():
+ """Test to explicitly check Array -> Series conversion."""
+ array = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us"))
+ assert isinstance(array, pyarrow.Array)
+ assert not isinstance(array, pyarrow.ChunkedArray)
+ series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array))
+ expected = pandas.Series([dt.time(15, 21, 0, 123_456)], dtype="dbtime")
+ pandas.testing.assert_series_equal(series, expected)
+
+
+def test_dbtime_series_from_arrow_chunkedarray():
+ """Test to explicitly check ChunkedArray -> Series conversion."""
+ array1 = pyarrow.array([dt.time(15, 21, 0, 123_456)], type=pyarrow.time64("us"))
+ array2 = pyarrow.array([dt.time(0, 0, 0, 0)], type=pyarrow.time64("us"))
+ array = pyarrow.chunked_array([array1, array2])
+ assert isinstance(array, pyarrow.ChunkedArray)
+ series = pandas.Series(db_dtypes.TimeDtype.__from_arrow__(array))
+ expected = pandas.Series(
+ [dt.time(15, 21, 0, 123_456), dt.time(0, 0, 0, 0)], dtype="dbtime"
+ )
+ pandas.testing.assert_series_equal(series, expected)
+
+
+def test_dataframe_from_arrow():
+ record_batch = pyarrow.RecordBatch.from_arrays(
+ [
pyarrow.array(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)],
- type=pyarrow.time32("ms"),
- ),
- ),
- (
- pandas.Series(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
- dtype="dbtime",
+ [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)],
+ type=pyarrow.date32(),
),
pyarrow.array(
- [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
- type=pyarrow.time64("us"),
- ),
- ),
- (
- pandas.Series(
[
- dt.time(0, 0, 0, 0),
- dt.time(12, 30, 15, 125_000),
+ dt.time(10, 7, 8, 995_325),
dt.time(23, 59, 59, 999_999),
+ None,
+ dt.time(0, 0, 0, 0),
],
- dtype="dbtime",
+ type=pyarrow.time64("us"),
),
- pyarrow.array(
+ pyarrow.array([1, 2, 3, 4]),
+ ],
+ ["date_col", "time_col", "int_col"],
+ )
+ dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper)
+ expected = pandas.DataFrame(
+ {
+ "date_col": pandas.Series(
+ [dt.date(2021, 11, 4), dt.date(2038, 1, 20), None, dt.date(1970, 1, 1)],
+ dtype="dbdate",
+ ),
+ "time_col": pandas.Series(
[
- dt.time(0, 0, 0, 0),
- dt.time(12, 30, 15, 125_000),
+ dt.time(10, 7, 8, 995_325),
dt.time(23, 59, 59, 999_999),
+ None,
+ dt.time(0, 0, 0, 0),
],
- type=pyarrow.time64("us"),
+ dtype="dbtime",
),
- ),
- ),
-)
-def test_to_arrow_w_arrow_type(series, expected):
- array = pyarrow.array(series, type=expected.type)
- assert array.equals(expected)
+ "int_col": [1, 2, 3, 4],
+ },
+ columns=["date_col", "time_col", "int_col"],
+ )
+ pandas.testing.assert_frame_equal(dataframe, expected)
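
End to end, the new tests exercise the path users would take: build a `RecordBatch` and hand the extension dtypes to `to_pandas` via `types_mapper` (which pyarrow ignores on a bare `Array`, per ARROW-9664). A usage sketch, assuming db_dtypes 0.3.0 is installed:

```python
import datetime

import pyarrow

import db_dtypes

batch = pyarrow.RecordBatch.from_arrays(
    [pyarrow.array([datetime.date(2021, 11, 8), None], type=pyarrow.date32())],
    ["date_col"],
)
# Mirrors the tests above: date_as_object=False plus a types_mapper.
dataframe = batch.to_pandas(
    date_as_object=False,
    types_mapper=lambda arrow_type: db_dtypes.DateDtype
    if arrow_type in (pyarrow.date32(), pyarrow.date64())
    else None,
)
print(dataframe["date_col"].dtype)  # dbdate
```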