From 70b17422cb14ae153438139a79dddaae258d421b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 20:50:46 +0000 Subject: [PATCH 1/7] chore: Use fixture for list/str accessor tests --- tests/data/repeated.jsonl | 4 +++ tests/data/repeated_schema.json | 12 +++++++ tests/system/conftest.py | 36 +++++++++++++++++++ tests/system/small/operations/test_lists.py | 29 ++++++--------- tests/system/small/operations/test_strings.py | 13 +++---- 5 files changed, 69 insertions(+), 25 deletions(-) create mode 100644 tests/data/repeated.jsonl create mode 100644 tests/data/repeated_schema.json diff --git a/tests/data/repeated.jsonl b/tests/data/repeated.jsonl new file mode 100644 index 0000000000..eb9c4317ac --- /dev/null +++ b/tests/data/repeated.jsonl @@ -0,0 +1,4 @@ +{"rowindex": 0, "list_col": [1]} +{"rowindex": 1, "list_col": [1,2]} +{"rowindex": 2, "list_col": [1,2,3]} +{"rowindex": 3, "list_col": [1,2,3,4]} diff --git a/tests/data/repeated_schema.json b/tests/data/repeated_schema.json new file mode 100644 index 0000000000..e6f32bd9e1 --- /dev/null +++ b/tests/data/repeated_schema.json @@ -0,0 +1,12 @@ +[ + { + "name": "rowindex", + "type": "INTEGER", + "mode": "REQUIRED" + }, + { + "name": "list_col", + "type": "INTEGER", + "mode": "REPEATED" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 49cd887cfd..99257c6c00 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -32,6 +32,7 @@ import ibis.backends import numpy as np import pandas as pd +import pyarrow as pa import pytest import pytz import test_utils.prefixer @@ -39,6 +40,7 @@ import bigframes import bigframes.dataframe import bigframes.pandas as bpd +import bigframes.series import tests.system.utils # Use this to control the number of cloud functions being deleted in a single @@ -294,6 +296,7 @@ def load_test_data_tables( ("scalars", "scalars_schema.json", "scalars.jsonl"), ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("nested", "nested_schema.json", "nested.jsonl"), + ("repeated", "repeated_schema.json", "repeated.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), @@ -369,6 +372,9 @@ def scalars_table_tokyo(test_data_tables_tokyo) -> str: def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] +@pytest.fixture(scope="session") +def repeated_table_id(test_data_tables) -> str: + return test_data_tables["repeated"] @pytest.fixture(scope="session") def penguins_table_id(test_data_tables) -> str: @@ -409,6 +415,36 @@ def nested_pandas_df() -> pd.DataFrame: df = df.set_index("rowindex") return df +@pytest.fixture(scope="session") +def repeated_df( + repeated_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """Returns a DataFrame containing columns of list type.""" + return session.read_gbq(repeated_table_id, index_col="rowindex") + +@pytest.fixture(scope="session") +def repeated_series( + repeated_df: bigframes.dataframe.DataFrame +) -> bigframes.series.Series: + """Returns a Series of lists""" + return repeated_df["list_col"] + + +@pytest.fixture(scope="session") +def repeated_pandas_df() -> pd.DataFrame: + """Returns a DataFrame containing columns of list type.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + df = df.set_index("rowindex") + return df + +@pytest.fixture(scope="session") +def repeated_pandas_series(repeated_pandas_df: pd.DataFrame) -> pd.Series: + """pd.DataFrame pointing at test data.""" + return repeated_pandas_df["list_col"].astype(pd.ArrowDtype(pa.list_(pa.int64()))) @pytest.fixture(scope="session") def scalars_df_default_index( diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 7ecf79dc6a..5146b8047e 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -32,19 +32,16 @@ pytest.param(slice(0, 2, None), id="default_step_slice"), ], ) -def test_getitem(key): +def test_getitem(key, repeated_series, repeated_pandas_series): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list[key].to_pandas() - pd_result = pd_s.list[key] + bf_result = repeated_series.list[key].to_pandas() + pd_result = repeated_pandas_series.list[key] - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) @pytest.mark.parametrize( @@ -60,24 +57,18 @@ def test_getitem(key): (slice(0, 2, 2), pytest.raises(NotImplementedError)), ], ) -def test_getitem_notsupported(key, expectation): - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - +def test_getitem_notsupported(key, expectation, repeated_series): with expectation as e: - assert s.list[key] == e + assert repeated_series.list[key] == e -def test_len(): +def test_len(repeated_series, repeated_pandas_series): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[], [1], [1, 2], [1, 2, 3]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list.len().to_pandas() - pd_result = pd_s.list.len() + bf_result = repeated_series.list.len().to_pandas() + pd_result = repeated_pandas_series.list.len() - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 15e8512317..56b08f96cb 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -622,13 +622,14 @@ def test_getitem_w_string(scalars_dfs, index): pytest.param(slice(0, 0, None), id="single_one_slice"), ], ) -def test_getitem_w_array(index): - data = [[1], [2, 3], [], [4, 5, 6]] - s = bpd.Series(data) - pd_s = pd.Series(data) +def test_getitem_w_array(index, repeated_series, repeated_pandas_df): + bf_result = repeated_series.str[index].to_pandas() + # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. + # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. + # Using str accessor on this type would fail the type check. However, the default + # type for lists is Object in pandas, and it happily accepts str accessors. + pd_result = repeated_pandas_df['list_col'].str[index] - bf_result = s.str[index].to_pandas() - pd_result = pd_s.str[index] # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) From 9994d261dae9560e5d8e4995338aceefb44dc1e9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 20:52:30 +0000 Subject: [PATCH 2/7] fix format --- tests/system/conftest.py | 8 +++++++- tests/system/small/operations/test_lists.py | 16 ++++++++++++++-- tests/system/small/operations/test_strings.py | 8 ++++---- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 99257c6c00..9d3e25e080 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -372,10 +372,12 @@ def scalars_table_tokyo(test_data_tables_tokyo) -> str: def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] + @pytest.fixture(scope="session") def repeated_table_id(test_data_tables) -> str: return test_data_tables["repeated"] + @pytest.fixture(scope="session") def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] @@ -415,6 +417,7 @@ def nested_pandas_df() -> pd.DataFrame: df = df.set_index("rowindex") return df + @pytest.fixture(scope="session") def repeated_df( repeated_table_id: str, session: bigframes.Session @@ -422,9 +425,10 @@ def repeated_df( """Returns a DataFrame containing columns of list type.""" return session.read_gbq(repeated_table_id, index_col="rowindex") + @pytest.fixture(scope="session") def repeated_series( - repeated_df: bigframes.dataframe.DataFrame + repeated_df: bigframes.dataframe.DataFrame, ) -> bigframes.series.Series: """Returns a Series of lists""" return repeated_df["list_col"] @@ -441,11 +445,13 @@ def repeated_pandas_df() -> pd.DataFrame: df = df.set_index("rowindex") return df + @pytest.fixture(scope="session") def repeated_pandas_series(repeated_pandas_df: pd.DataFrame) -> pd.Series: """pd.DataFrame pointing at test data.""" return repeated_pandas_df["list_col"].astype(pd.ArrowDtype(pa.list_(pa.int64()))) + @pytest.fixture(scope="session") def scalars_df_default_index( scalars_df_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 5146b8047e..1689c230d9 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -41,7 +41,13 @@ def test_getitem(key, repeated_series, repeated_pandas_series): bf_result = repeated_series.list[key].to_pandas() pd_result = repeated_pandas_series.list[key] - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) @pytest.mark.parametrize( @@ -71,4 +77,10 @@ def test_len(repeated_series, repeated_pandas_series): bf_result = repeated_series.list.len().to_pandas() pd_result = repeated_pandas_series.list.len() - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 56b08f96cb..ab238e3cbc 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -624,11 +624,11 @@ def test_getitem_w_string(scalars_dfs, index): ) def test_getitem_w_array(index, repeated_series, repeated_pandas_df): bf_result = repeated_series.str[index].to_pandas() - # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. - # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. - # Using str accessor on this type would fail the type check. However, the default + # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. + # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. + # Using str accessor on this type would fail the type check. However, the default # type for lists is Object in pandas, and it happily accepts str accessors. - pd_result = repeated_pandas_df['list_col'].str[index] + pd_result = repeated_pandas_df["list_col"].str[index] # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) From e7edcdf3ccd4528c922e7d7390f46ba191e2691d Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 16 Sep 2024 20:53:38 +0000 Subject: [PATCH 3/7] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/conftest.py | 8 +++++++- tests/system/small/operations/test_lists.py | 16 ++++++++++++++-- tests/system/small/operations/test_strings.py | 8 ++++---- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 99257c6c00..9d3e25e080 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -372,10 +372,12 @@ def scalars_table_tokyo(test_data_tables_tokyo) -> str: def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] + @pytest.fixture(scope="session") def repeated_table_id(test_data_tables) -> str: return test_data_tables["repeated"] + @pytest.fixture(scope="session") def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] @@ -415,6 +417,7 @@ def nested_pandas_df() -> pd.DataFrame: df = df.set_index("rowindex") return df + @pytest.fixture(scope="session") def repeated_df( repeated_table_id: str, session: bigframes.Session @@ -422,9 +425,10 @@ def repeated_df( """Returns a DataFrame containing columns of list type.""" return session.read_gbq(repeated_table_id, index_col="rowindex") + @pytest.fixture(scope="session") def repeated_series( - repeated_df: bigframes.dataframe.DataFrame + repeated_df: bigframes.dataframe.DataFrame, ) -> bigframes.series.Series: """Returns a Series of lists""" return repeated_df["list_col"] @@ -441,11 +445,13 @@ def repeated_pandas_df() -> pd.DataFrame: df = df.set_index("rowindex") return df + @pytest.fixture(scope="session") def repeated_pandas_series(repeated_pandas_df: pd.DataFrame) -> pd.Series: """pd.DataFrame pointing at test data.""" return repeated_pandas_df["list_col"].astype(pd.ArrowDtype(pa.list_(pa.int64()))) + @pytest.fixture(scope="session") def scalars_df_default_index( scalars_df_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 5146b8047e..1689c230d9 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -41,7 +41,13 @@ def test_getitem(key, repeated_series, repeated_pandas_series): bf_result = repeated_series.list[key].to_pandas() pd_result = repeated_pandas_series.list[key] - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) @pytest.mark.parametrize( @@ -71,4 +77,10 @@ def test_len(repeated_series, repeated_pandas_series): bf_result = repeated_series.list.len().to_pandas() pd_result = repeated_pandas_series.list.len() - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False, check_names=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 56b08f96cb..ab238e3cbc 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -624,11 +624,11 @@ def test_getitem_w_string(scalars_dfs, index): ) def test_getitem_w_array(index, repeated_series, repeated_pandas_df): bf_result = repeated_series.str[index].to_pandas() - # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. - # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. - # Using str accessor on this type would fail the type check. However, the default + # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. + # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. + # Using str accessor on this type would fail the type check. However, the default # type for lists is Object in pandas, and it happily accepts str accessors. - pd_result = repeated_pandas_df['list_col'].str[index] + pd_result = repeated_pandas_df["list_col"].str[index] # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) From 9a9d0ac3452dc7d3e27462a252abd54fe2d23866 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 22:53:23 +0000 Subject: [PATCH 4/7] add more type coverage in tests --- tests/data/repeated.jsonl | 7 ++-- tests/data/repeated_schema.json | 32 ++++++++++++++- tests/system/conftest.py | 14 ------- tests/system/small/operations/test_lists.py | 40 +++++++++++++++---- tests/system/small/operations/test_strings.py | 26 +++++++----- 5 files changed, 82 insertions(+), 37 deletions(-) diff --git a/tests/data/repeated.jsonl b/tests/data/repeated.jsonl index eb9c4317ac..b3c47772f6 100644 --- a/tests/data/repeated.jsonl +++ b/tests/data/repeated.jsonl @@ -1,4 +1,3 @@ -{"rowindex": 0, "list_col": [1]} -{"rowindex": 1, "list_col": [1,2]} -{"rowindex": 2, "list_col": [1,2,3]} -{"rowindex": 3, "list_col": [1,2,3,4]} +{"rowindex": 0, "int_list_col": [1], "bool_list_col": [true], "float_list_col": [1.2, 2.3], "date_list_col": ["2021-07-21"], "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]} +{"rowindex": 1, "int_list_col": [1,2], "bool_list_col": [true, false], "float_list_col": [1.1], "date_list_col": ["2021-07-21", "1987-03-28"], "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3], "string_list_col": ["a", "bc", "de"]} +{"rowindex": 2, "int_list_col": [1,2,3], "bool_list_col": [true], "float_list_col": [0.5, -1.9, 2.3], "date_list_col": ["2017-08-01", "2004-11-22"], "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7], "string_list_col": ["", "a"]} diff --git a/tests/data/repeated_schema.json b/tests/data/repeated_schema.json index e6f32bd9e1..300f32c994 100644 --- a/tests/data/repeated_schema.json +++ b/tests/data/repeated_schema.json @@ -5,8 +5,38 @@ "mode": "REQUIRED" }, { - "name": "list_col", + "name": "int_list_col", "type": "INTEGER", "mode": "REPEATED" + }, + { + "name": "bool_list_col", + "type": "BOOLEAN", + "mode": "REPEATED" + }, + { + "name": "float_list_col", + "type": "FLOAT", + "mode": "REPEATED" + }, + { + "name": "date_list_col", + "type": "DATE", + "mode": "REPEATED" + }, + { + "name": "date_time_list_col", + "type": "DATETIME", + "mode": "REPEATED" + }, + { + "name": "numeric_list_col", + "type": "NUMERIC", + "mode": "REPEATED" + }, + { + "name": "string_list_col", + "type": "STRING", + "mode": "REPEATED" } ] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 9d3e25e080..e7a85c5a65 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -426,14 +426,6 @@ def repeated_df( return session.read_gbq(repeated_table_id, index_col="rowindex") -@pytest.fixture(scope="session") -def repeated_series( - repeated_df: bigframes.dataframe.DataFrame, -) -> bigframes.series.Series: - """Returns a Series of lists""" - return repeated_df["list_col"] - - @pytest.fixture(scope="session") def repeated_pandas_df() -> pd.DataFrame: """Returns a DataFrame containing columns of list type.""" @@ -446,12 +438,6 @@ def repeated_pandas_df() -> pd.DataFrame: return df -@pytest.fixture(scope="session") -def repeated_pandas_series(repeated_pandas_df: pd.DataFrame) -> pd.Series: - """pd.DataFrame pointing at test data.""" - return repeated_pandas_df["list_col"].astype(pd.ArrowDtype(pa.list_(pa.int64()))) - - @pytest.fixture(scope="session") def scalars_df_default_index( scalars_df_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 1689c230d9..06529743ed 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -32,14 +32,26 @@ pytest.param(slice(0, 2, None), id="default_step_slice"), ], ) -def test_getitem(key, repeated_series, repeated_pandas_series): +@pytest.mark.parametrize( + ("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ] +) +def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - bf_result = repeated_series.list[key].to_pandas() - pd_result = repeated_pandas_series.list[key] + bf_result = repeated_df[column_name].list[key].to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list[key] assert_series_equal( pd_result, @@ -63,19 +75,31 @@ def test_getitem(key, repeated_series, repeated_pandas_series): (slice(0, 2, 2), pytest.raises(NotImplementedError)), ], ) -def test_getitem_notsupported(key, expectation, repeated_series): +def test_getitem_notsupported(key, expectation, repeated_df): with expectation as e: - assert repeated_series.list[key] == e + assert repeated_df['int_list_col'].list[key] == e -def test_len(repeated_series, repeated_pandas_series): +@pytest.mark.parametrize( + ("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ] +) +def test_len(column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - bf_result = repeated_series.list.len().to_pandas() - pd_result = repeated_pandas_series.list.len() + bf_result = repeated_df[column_name].list.len().to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list.len() assert_series_equal( pd_result, diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index ab238e3cbc..568dcc416a 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -615,22 +615,28 @@ def test_getitem_w_string(scalars_dfs, index): @pytest.mark.parametrize( ("index"), [ - pytest.param(2, id="int"), + pytest.param(0, id="int"), pytest.param(slice(None, None, None), id="default_start_slice"), pytest.param(slice(0, None, 1), id="default_stop_slice"), pytest.param(slice(0, 2, None), id="default_step_slice"), pytest.param(slice(0, 0, None), id="single_one_slice"), ], ) -def test_getitem_w_array(index, repeated_series, repeated_pandas_df): - bf_result = repeated_series.str[index].to_pandas() - # We use repeated_pandas_df['list_col'] instead of repeated_pandas_series. - # Reason: The series fixture contains lists that are strongly-typed as PyArrow lists. - # Using str accessor on this type would fail the type check. However, the default - # type for lists is Object in pandas, and it happily accepts str accessors. - pd_result = repeated_pandas_df["list_col"].str[index] - - # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. +@pytest.mark.parametrize( + "column_name", + [ + pytest.param("int_list_col"), + pytest.param("bool_list_col"), + pytest.param("float_list_col"), + pytest.param("string_list_col"), + # date, date_time and numeric are excluded because their default types are different + # in Pandas and BigFrames + ] +) +def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df): + bf_result = repeated_df[column_name].str[index].to_pandas() + pd_result = repeated_pandas_df[column_name].str[index] + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) From f27dda7548ba8a4dec3b878bc657a6b4965dbfc1 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 22:54:00 +0000 Subject: [PATCH 5/7] fix format --- tests/system/small/operations/test_lists.py | 6 +++--- tests/system/small/operations/test_strings.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 06529743ed..4b012d9ffa 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -42,7 +42,7 @@ pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), - ] + ], ) def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): @@ -77,7 +77,7 @@ def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df): ) def test_getitem_notsupported(key, expectation, repeated_df): with expectation as e: - assert repeated_df['int_list_col'].list[key] == e + assert repeated_df["int_list_col"].list[key] == e @pytest.mark.parametrize( @@ -90,7 +90,7 @@ def test_getitem_notsupported(key, expectation, repeated_df): pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), - ] + ], ) def test_len(column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 568dcc416a..98fecaa93b 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -631,7 +631,7 @@ def test_getitem_w_string(scalars_dfs, index): pytest.param("string_list_col"), # date, date_time and numeric are excluded because their default types are different # in Pandas and BigFrames - ] + ], ) def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df): bf_result = repeated_df[column_name].str[index].to_pandas() From 2c0eeb329536e7015384e1fa693524f5f3d8754d Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 22:56:17 +0000 Subject: [PATCH 6/7] remove unnecessary dep --- tests/system/small/operations/test_lists.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 4b012d9ffa..7b39bdebd5 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -18,8 +18,6 @@ import pyarrow as pa import pytest -import bigframes.pandas as bpd - from ...utils import assert_series_equal From 40ccad90fc2032d33f43868e31dd7fe955ca3b57 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 22:59:43 +0000 Subject: [PATCH 7/7] remove import --- tests/system/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e7a85c5a65..9cfb9082af 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -32,7 +32,6 @@ import ibis.backends import numpy as np import pandas as pd -import pyarrow as pa import pytest import pytz import test_utils.prefixer