diff --git a/tests/data/repeated.jsonl b/tests/data/repeated.jsonl new file mode 100644 index 0000000000..b3c47772f6 --- /dev/null +++ b/tests/data/repeated.jsonl @@ -0,0 +1,3 @@ +{"rowindex": 0, "int_list_col": [1], "bool_list_col": [true], "float_list_col": [1.2, 2.3], "date_list_col": ["2021-07-21"], "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]} +{"rowindex": 1, "int_list_col": [1,2], "bool_list_col": [true, false], "float_list_col": [1.1], "date_list_col": ["2021-07-21", "1987-03-28"], "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3], "string_list_col": ["a", "bc", "de"]} +{"rowindex": 2, "int_list_col": [1,2,3], "bool_list_col": [true], "float_list_col": [0.5, -1.9, 2.3], "date_list_col": ["2017-08-01", "2004-11-22"], "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7], "string_list_col": ["", "a"]} diff --git a/tests/data/repeated_schema.json b/tests/data/repeated_schema.json new file mode 100644 index 0000000000..300f32c994 --- /dev/null +++ b/tests/data/repeated_schema.json @@ -0,0 +1,42 @@ +[ + { + "name": "rowindex", + "type": "INTEGER", + "mode": "REQUIRED" + }, + { + "name": "int_list_col", + "type": "INTEGER", + "mode": "REPEATED" + }, + { + "name": "bool_list_col", + "type": "BOOLEAN", + "mode": "REPEATED" + }, + { + "name": "float_list_col", + "type": "FLOAT", + "mode": "REPEATED" + }, + { + "name": "date_list_col", + "type": "DATE", + "mode": "REPEATED" + }, + { + "name": "date_time_list_col", + "type": "DATETIME", + "mode": "REPEATED" + }, + { + "name": "numeric_list_col", + "type": "NUMERIC", + "mode": "REPEATED" + }, + { + "name": "string_list_col", + "type": "STRING", + "mode": "REPEATED" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 49cd887cfd..9cfb9082af 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -39,6 +39,7 @@ import bigframes import bigframes.dataframe import bigframes.pandas as bpd +import bigframes.series import tests.system.utils # Use this to control the number of cloud functions being deleted in a single @@ -294,6 +295,7 @@ def load_test_data_tables( ("scalars", "scalars_schema.json", "scalars.jsonl"), ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("nested", "nested_schema.json", "nested.jsonl"), + ("repeated", "repeated_schema.json", "repeated.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), @@ -370,6 +372,11 @@ def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] +@pytest.fixture(scope="session") +def repeated_table_id(test_data_tables) -> str: + return test_data_tables["repeated"] + + @pytest.fixture(scope="session") def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] @@ -410,6 +417,26 @@ def nested_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def repeated_df( + repeated_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """Returns a DataFrame containing columns of list type.""" + return session.read_gbq(repeated_table_id, index_col="rowindex") + + +@pytest.fixture(scope="session") +def repeated_pandas_df() -> pd.DataFrame: + """Returns a DataFrame containing columns of list type.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + df = df.set_index("rowindex") + return df + + @pytest.fixture(scope="session") def scalars_df_default_index( scalars_df_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 7ecf79dc6a..7b39bdebd5 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -18,8 +18,6 @@ import pyarrow as pa import pytest -import bigframes.pandas as bpd - from ...utils import assert_series_equal @@ -32,19 +30,34 @@ pytest.param(slice(0, 2, None), id="default_step_slice"), ], ) -def test_getitem(key): +@pytest.mark.parametrize( + ("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ], +) +def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list[key].to_pandas() - pd_result = pd_s.list[key] + bf_result = repeated_df[column_name].list[key].to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list[key] - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) @pytest.mark.parametrize( @@ -60,24 +73,36 @@ def test_getitem(key): (slice(0, 2, 2), pytest.raises(NotImplementedError)), ], ) -def test_getitem_notsupported(key, expectation): - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - +def test_getitem_notsupported(key, expectation, repeated_df): with expectation as e: - assert s.list[key] == e + assert repeated_df["int_list_col"].list[key] == e -def test_len(): +@pytest.mark.parametrize( + ("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ], +) +def test_len(column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[], [1], [1, 2], [1, 2, 3]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list.len().to_pandas() - pd_result = pd_s.list.len() + bf_result = repeated_df[column_name].list.len().to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list.len() - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 15e8512317..98fecaa93b 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -615,21 +615,28 @@ def test_getitem_w_string(scalars_dfs, index): @pytest.mark.parametrize( ("index"), [ - pytest.param(2, id="int"), + pytest.param(0, id="int"), pytest.param(slice(None, None, None), id="default_start_slice"), pytest.param(slice(0, None, 1), id="default_stop_slice"), pytest.param(slice(0, 2, None), id="default_step_slice"), pytest.param(slice(0, 0, None), id="single_one_slice"), ], ) -def test_getitem_w_array(index): - data = [[1], [2, 3], [], [4, 5, 6]] - s = bpd.Series(data) - pd_s = pd.Series(data) - - bf_result = s.str[index].to_pandas() - pd_result = pd_s.str[index] - # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. +@pytest.mark.parametrize( + "column_name", + [ + pytest.param("int_list_col"), + pytest.param("bool_list_col"), + pytest.param("float_list_col"), + pytest.param("string_list_col"), + # date, date_time and numeric are excluded because their default types are different + # in Pandas and BigFrames + ], +) +def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df): + bf_result = repeated_df[column_name].str[index].to_pandas() + pd_result = repeated_pandas_df[column_name].str[index] + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)