From eb496d999db872fb254c199ddca26171d75d67a0 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Wed, 19 Mar 2025 13:15:37 -0700 Subject: [PATCH 01/39] chore: remove unused import in function.py (#1502) --- bigframes/functions/function.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index fd2f512f97..30b3d23056 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -35,7 +35,6 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting -import bigframes.functions.function_template from . import _function_session as bff_session from . import _utils From a9cf215fb1403fda4ab2b58252f5fedc33aba3e1 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:20:07 -0700 Subject: [PATCH 02/39] feat: Allow iloc to support lists of negative indices (#1497) * feat: support iloc with negative indices * update partial ordering test * update naming * update logic * update comment * update logic and tests * update filter --- bigframes/core/indexers.py | 36 +++++++++++++++++++++++++++- tests/system/small/test_dataframe.py | 24 ++++++++++++++----- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index d1a0c42e97..6258eb00d5 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -27,6 +27,7 @@ import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar +import bigframes.core.window_spec as windows import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions as bfe @@ -477,6 +478,19 @@ def _iloc_getitem_series_or_dataframe( Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) + + # Check if both positive index and negative index are necessary + if isinstance(key, (bigframes.series.Series, 
indexes.Index)): + # Avoid data download + is_key_unisigned = False + else: + first_sign = key[0] >= 0 + is_key_unisigned = True + for k in key: + if (k >= 0) != first_sign: + is_key_unisigned = False + break + if isinstance(series_or_dataframe, bigframes.series.Series): original_series_name = series_or_dataframe.name series_name = ( @@ -497,7 +511,27 @@ def _iloc_getitem_series_or_dataframe( block = df._block # explicitly set index to offsets, reset_index may not generate offsets in some modes block, offsets_id = block.promote_offsets("temp_iloc_offsets_") - block = block.set_index([offsets_id]) + pos_block = block.set_index([offsets_id]) + + if not is_key_unisigned or key[0] < 0: + neg_block, size_col_id = block.apply_window_op( + offsets_id, + ops.aggregations.SizeUnaryOp(), + window_spec=windows.rows(), + ) + neg_block, neg_index_id = neg_block.apply_binary_op( + offsets_id, size_col_id, ops.SubOp() + ) + + neg_block = neg_block.set_index([neg_index_id]).drop_columns( + [size_col_id, offsets_id] + ) + + if is_key_unisigned: + block = pos_block if key[0] >= 0 else neg_block + else: + block = pos_block.concat([neg_block], how="inner") + df = bigframes.dataframe.DataFrame(block) result = df.loc[key] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b97846d992..c2e4a1c8ad 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4400,9 +4400,15 @@ def test_loc_list_multiindex(scalars_dfs_maybe_ordered): ) -def test_iloc_list(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7] - +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] @@ -4412,11 +4418,17 @@ def test_iloc_list(scalars_df_index, 
scalars_pandas_df_index): ) +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) def test_iloc_list_partial_ordering( - scalars_df_partial_ordering, scalars_pandas_df_index + scalars_df_partial_ordering, scalars_pandas_df_index, index_list ): - index_list = [0, 0, 0, 5, 4, 7] - bf_result = scalars_df_partial_ordering.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] From 75fc7e0268dc5b10bdbc33dcf28db97dce62e41c Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 19 Mar 2025 15:11:34 -0700 Subject: [PATCH 03/39] feat: Support dry_run in `to_pandas()` (#1436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Support dry_run in * centralize dry_run logics at block level * fix lint errors * remove unnecessary code * use dataframe for dry_run stats * flatten the job stats to a series * fix lint * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix query job issue * Make pandas surface directly call block._compute_dry_run * type hint update --------- Co-authored-by: Owl Bot --- bigframes/core/blocks.py | 120 ++++++++++++++++++++---- bigframes/core/indexes/base.py | 44 +++++++-- bigframes/dataframe.py | 51 +++++++++- bigframes/series.py | 27 +++++- tests/system/small/test_dataframe_io.py | 8 ++ tests/system/small/test_index.py | 9 ++ tests/system/small/test_series.py | 8 ++ tests/unit/core/test_blocks.py | 8 ++ 8 files changed, 243 insertions(+), 32 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index db6007b41a..2992718412 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -22,6 +22,7 @@ from __future__ import annotations import ast +import copy import dataclasses import datetime import functools @@ -30,6 +31,7 @@ import textwrap import typing from typing import ( + Any, Iterable, 
List, Literal, @@ -49,7 +51,7 @@ import pyarrow as pa from bigframes import session -import bigframes._config.sampling_options as sampling_options +from bigframes._config import sampling_options import bigframes.constants import bigframes.core as core import bigframes.core.compile.googlesql as googlesql @@ -535,19 +537,9 @@ def to_pandas( Returns: pandas.DataFrame, QueryJob """ - if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) - - sampling = bigframes.options.sampling.with_max_download_size(max_download_size) - if sampling_method is not None: - sampling = sampling.with_method(sampling_method).with_random_state( # type: ignore - random_state - ) - else: - sampling = sampling.with_disabled() + sampling = self._get_sampling_option( + max_download_size, sampling_method, random_state + ) df, query_job = self._materialize_local( materialize_options=MaterializationOptions( @@ -559,6 +551,27 @@ def to_pandas( df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job + def _get_sampling_option( + self, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, + ) -> sampling_options.SamplingOptions: + + if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." 
+ ) + + sampling = bigframes.options.sampling.with_max_download_size(max_download_size) + if sampling_method is None: + return sampling.with_disabled() + + return sampling.with_method(sampling_method).with_random_state( # type: ignore + random_state + ) + def try_peek( self, n: int = 20, force: bool = False, allow_large_results=None ) -> typing.Optional[pd.DataFrame]: @@ -798,11 +811,73 @@ def split( return [sliced_block.drop_columns(drop_cols) for sliced_block in sliced_blocks] def _compute_dry_run( - self, value_keys: Optional[Iterable[str]] = None - ) -> bigquery.QueryJob: + self, + value_keys: Optional[Iterable[str]] = None, + *, + ordered: bool = True, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, + ) -> typing.Tuple[pd.Series, bigquery.QueryJob]: + sampling = self._get_sampling_option( + max_download_size, sampling_method, random_state + ) + if sampling.enable_downsampling: + raise NotImplementedError("Dry run with sampling is not supported") + + index: List[Any] = [] + values: List[Any] = [] + + index.append("columnCount") + values.append(len(self.value_columns)) + index.append("columnDtypes") + values.append( + { + col: self.expr.get_column_type(self.resolve_label_exact_or_error(col)) + for col in self.column_labels + } + ) + + index.append("indexLevel") + values.append(self.index.nlevels) + index.append("indexDtypes") + values.append(self.index.dtypes) + expr = self._apply_value_keys_to_expr(value_keys=value_keys) - query_job = self.session._executor.dry_run(expr) - return query_job + query_job = self.session._executor.dry_run(expr, ordered) + job_api_repr = copy.deepcopy(query_job._properties) + + job_ref = job_api_repr["jobReference"] + for key, val in job_ref.items(): + index.append(key) + values.append(val) + + index.append("jobType") + values.append(job_api_repr["configuration"]["jobType"]) + + query_config = job_api_repr["configuration"]["query"] + for key in 
("destinationTable", "useLegacySql"): + index.append(key) + values.append(query_config.get(key)) + + query_stats = job_api_repr["statistics"]["query"] + for key in ( + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + ): + index.append(key) + values.append(query_stats.get(key)) + + index.append("creationTime") + values.append( + pd.Timestamp( + job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC" + ) + ) + + return pd.Series(values, index=index), query_job def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): expr = self._expr @@ -2703,11 +2778,18 @@ def to_pandas( "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." ) ordered = ordered if ordered is not None else True + df, query_job = self._block.select_columns([]).to_pandas( - ordered=ordered, allow_large_results=allow_large_results + ordered=ordered, + allow_large_results=allow_large_results, ) return df.index, query_job + def _compute_dry_run( + self, *, ordered: bool = True + ) -> Tuple[pd.Series, bigquery.QueryJob]: + return self._block.select_columns([])._compute_dry_run(ordered=ordered) + def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: if utils.is_list_like(level): levels = list(level) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 84da6c5de0..900825996e 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -17,7 +17,7 @@ from __future__ import annotations import typing -from typing import Hashable, Literal, Optional, Sequence, Union +from typing import Hashable, Literal, Optional, overload, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index @@ -228,7 +228,7 @@ def T(self) -> Index: return self.transpose() @property - def query_job(self) -> Optional[bigquery.QueryJob]: + def query_job(self) -> bigquery.QueryJob: 
"""BigQuery job metadata for the most recent query. Returns: @@ -236,7 +236,8 @@ def query_job(self) -> Optional[bigquery.QueryJob]: `_. """ if self._query_job is None: - self._query_job = self._block._compute_dry_run() + _, query_job = self._block._compute_dry_run() + self._query_job = query_job return self._query_job def __repr__(self) -> str: @@ -252,7 +253,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": - return formatter.repr_query_job(self._block._compute_dry_run()) + _, dry_run_query_job = self._block._compute_dry_run() + return formatter.repr_query_job(dry_run_query_job) pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._query_job = query_job @@ -490,18 +492,46 @@ def __getitem__(self, key: int) -> typing.Any: else: raise NotImplementedError(f"Index key not supported {key}") - def to_pandas(self, *, allow_large_results: Optional[bool] = None) -> pandas.Index: + @overload + def to_pandas( + self, + *, + allow_large_results: Optional[bool] = ..., + dry_run: Literal[False] = ..., + ) -> pandas.Index: + ... + + @overload + def to_pandas( + self, *, allow_large_results: Optional[bool] = ..., dry_run: Literal[True] = ... + ) -> pandas.Series: + ... + + def to_pandas( + self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False + ) -> pandas.Index | pandas.Series: """Gets the Index as a pandas Index. Args: allow_large_results (bool, default None): If not None, overrides the global setting to allow or disallow large query results over the default size limit of 10 GB. + dry_run (bool, default False): + If this argument is true, this method will not process the data. Instead, it returns + a Pandas series containing dtype and the amount of bytes to be processed. Returns: - pandas.Index: - A pandas Index with all of the labels from this Index. + pandas.Index | pandas.Series: + A pandas Index with all of the labels from this Index. 
If dry run is set to True, + returns a Series containing dry run statistics. """ + if dry_run: + dry_run_stats, dry_run_job = self._block.index._compute_dry_run( + ordered=True + ) + self._query_job = dry_run_job + return dry_run_stats + df, query_job = self._block.index.to_pandas( ordered=True, allow_large_results=allow_large_results ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index abab9fd268..6d1425617c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -32,6 +32,7 @@ Literal, Mapping, Optional, + overload, Sequence, Tuple, Union, @@ -1594,6 +1595,32 @@ def to_arrow( self._set_internal_query_job(query_job) return pa_table + @overload + def to_pandas( + self, + max_download_size: Optional[int] = ..., + sampling_method: Optional[str] = ..., + random_state: Optional[int] = ..., + *, + ordered: bool = ..., + dry_run: Literal[False] = ..., + allow_large_results: Optional[bool] = ..., + ) -> pandas.DataFrame: + ... + + @overload + def to_pandas( + self, + max_download_size: Optional[int] = ..., + sampling_method: Optional[str] = ..., + random_state: Optional[int] = ..., + *, + ordered: bool = ..., + dry_run: Literal[True] = ..., + allow_large_results: Optional[bool] = ..., + ) -> pandas.Series: + ... + def to_pandas( self, max_download_size: Optional[int] = None, @@ -1601,8 +1628,9 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + dry_run: bool = False, allow_large_results: Optional[bool] = None, - ) -> pandas.DataFrame: + ) -> pandas.DataFrame | pandas.Series: """Write DataFrame to pandas DataFrame. Args: @@ -1624,6 +1652,9 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas dataframe will be ordered. In some cases, unordered may result in a faster-executing query. + dry_run (bool, default False): + If this argument is true, this method will not process the data. 
Instead, it returns + a Pandas Series containing dry run statistics allow_large_results (bool, default None): If not None, overrides the global setting to allow or disallow large query results over the default size limit of 10 GB. @@ -1631,9 +1662,22 @@ def to_pandas( Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the data_sampling_threshold_mb is not exceeded; otherwise, a pandas DataFrame with - downsampled rows and all columns of this DataFrame. + downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas + Series containing dry run statistics will be returned. """ + # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job + + if dry_run: + dry_run_stats, dry_run_job = self._block._compute_dry_run( + max_download_size=max_download_size, + sampling_method=sampling_method, + random_state=random_state, + ordered=ordered, + ) + self._set_internal_query_job(dry_run_job) + return dry_run_stats + df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1679,7 +1723,8 @@ def to_pandas_batches( ) def _compute_dry_run(self) -> bigquery.QueryJob: - return self._block._compute_dry_run() + _, query_job = self._block._compute_dry_run() + return query_job def copy(self) -> DataFrame: return DataFrame(self._block) diff --git a/bigframes/series.py b/bigframes/series.py index 34ac3c3de9..b86b1ca6db 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -381,6 +381,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + dry_run: bool = False, allow_large_results: Optional[bool] = None, ) -> pandas.Series: """Writes Series to pandas Series. @@ -404,15 +405,32 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas series will be ordered. In some cases, unordered may result in a faster-executing query. 
+ dry_run (bool, default False): + If this argument is true, this method will not process the data. Instead, it returns + a Pandas Series containing dry run job statistics allow_large_results (bool, default None): If not None, overrides the global setting to allow or disallow large query results over the default size limit of 10 GB. - Returns: pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb - is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. + is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If dry_run + is set to True, a pandas Series containing dry run statistics will be returned. """ + + if dry_run: + dry_run_stats, dry_run_job = self._block._compute_dry_run( + max_download_size=max_download_size, + sampling_method=sampling_method, + random_state=random_state, + ordered=ordered, + ) + + self._set_internal_query_job(dry_run_job) + return dry_run_stats + + # Repeat the to_pandas() call to make mypy deduce type correctly, because mypy cannot resolve + # Literal[True/False] to bool df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -420,14 +438,17 @@ def to_pandas( ordered=ordered, allow_large_results=allow_large_results, ) + if query_job: self._set_internal_query_job(query_job) + series = df.squeeze(axis=1) series.name = self._name return series def _compute_dry_run(self) -> bigquery.QueryJob: - return self._block._compute_dry_run((self._value_column,)) + _, query_job = self._block._compute_dry_run((self._value_column,)) + return query_job def drop( self, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 30a78b5cdc..d2ba96b41f 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -927,3 +927,11 @@ def test_to_sql_query_named_index_excluded( utils.assert_pandas_df_equal( roundtrip.to_pandas(), 
pd_df, check_index_type=False, ignore_order=True ) + + +def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): + bf_df = session.read_pandas(scalars_pandas_df_multi_index) + + result = bf_df.to_pandas(dry_run=True) + + assert len(result) == 14 diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 4d01bc5ee9..535e4bc9ae 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -24,6 +24,7 @@ def test_index_construct_from_list(): bf_result = bpd.Index( [3, 14, 159], dtype=pd.Int64Dtype(), name="my_index" ).to_pandas() + pd_result: pd.Index = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") pd.testing.assert_index_equal(bf_result, pd_result) @@ -425,3 +426,11 @@ def test_multiindex_repr_includes_all_names(session): ) index = session.read_pandas(df).set_index(["A", "B"]).index assert "names=['A', 'B']" in repr(index) + + +def test_to_pandas_dry_run(scalars_df_index): + index = scalars_df_index.index + + result = index.to_pandas(dry_run=True) + + assert len(result) == 14 diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index ef544b0a0b..f1f060caa4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4384,3 +4384,11 @@ def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): series = nested_structs_df["person"] assert series.name == "person" + + +def test_series_to_pandas_dry_run(scalars_df_index): + bf_series = scalars_df_index["int64_col"] + + result = bf_series.to_pandas(dry_run=True) + + assert len(result) == 14 diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index 8ed3acba0f..fb5a927e76 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -92,3 +92,11 @@ def test_block_from_local(data): pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index.names) == tuple(expected.index.names) 
assert block.shape == expected.shape + + +def test_block_compute_dry_run__raises_error_when_sampling_is_enabled(): + mock_session = mock.create_autospec(spec=bigframes.Session) + block = blocks.Block.from_local(pandas.DataFrame(), mock_session) + + with pytest.raises(NotImplementedError): + block._compute_dry_run(sampling_method="UNIFORM") From 7d00be67cf50fdf713c40912f207d14f0f65538f Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:09 -0700 Subject: [PATCH 04/39] feat: add df.take and series.take (#1509) * feat: add df.take and series.take * update --- bigframes/dataframe.py | 12 ++++++++++ bigframes/series.py | 7 ++++++ tests/system/small/test_dataframe.py | 18 +++++++++++++++ tests/system/small/test_series.py | 17 ++++++++++++++ .../bigframes_vendored/pandas/core/generic.py | 23 +++++++++++++++++++ 5 files changed, 77 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6d1425617c..38f663f56f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2219,6 +2219,18 @@ def add_suffix(self, suffix: str, axis: int | str | None = None) -> DataFrame: axis = 1 if axis is None else axis return DataFrame(self._get_block().add_suffix(suffix, axis)) + def take( + self, indices: typing.Sequence[int], axis: int | str | None = 0, **kwargs + ) -> DataFrame: + if not utils.is_list_like(indices): + raise ValueError("indices should be a list-like object.") + if axis == 0 or axis == "index": + return self.iloc[indices] + elif axis == 1 or axis == "columns": + return self.iloc[:, indices] + else: + raise ValueError(f"No axis named {axis} for object type DataFrame") + def filter( self, items: typing.Optional[typing.Iterable] = None, diff --git a/bigframes/series.py b/bigframes/series.py index b86b1ca6db..5f49daa07d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1652,6 +1652,13 @@ def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: def 
add_suffix(self, suffix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_suffix(suffix)) + def take( + self, indices: typing.Sequence[int], axis: int | str | None = 0, **kwargs + ) -> Series: + if not utils.is_list_like(indices): + raise ValueError("indices should be a list-like object.") + return typing.cast(Series, self.iloc[indices]) + def filter( self, items: typing.Optional[typing.Iterable] = None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index c2e4a1c8ad..8cc3be1577 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -807,6 +807,24 @@ def test_get_df_column_name_duplicate(scalars_dfs): pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) +@pytest.mark.parametrize( + ("indices", "axis"), + [ + ([1, 3, 5], 0), + ([2, 4, 6], 1), + ([1, -3, -5, -6], "index"), + ([-2, -4, -6], "columns"), + ], +) +def test_take_df(scalars_dfs, indices, axis): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices, axis=axis).to_pandas() + pd_result = scalars_pandas_df.take(indices, axis=axis) + + assert_pandas_df_equal(bf_result, pd_result) + + def test_filter_df(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f1f060caa4..acd267aaf8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1543,6 +1543,23 @@ def test_indexing_using_selected_series(scalars_dfs): ) +@pytest.mark.parametrize( + ("indices"), + [ + ([1, 3, 5]), + ([5, -3, -5, -6]), + ([-2, -4, -6]), + ], +) +def test_take(scalars_dfs, indices): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices).to_pandas() + pd_result = scalars_pandas_df.take(indices) + + assert_pandas_df_equal(bf_result, pd_result) + + def test_nested_filter(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs 
string_col = scalars_df["string_col"] diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index ee35bfa429..8dd43fd8da 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -910,6 +910,29 @@ def notna(self) -> NDFrame: notnull = notna + def take(self, indices, axis=0, **kwargs) -> NDFrame: + """Return the elements in the given positional indices along an axis. + + This means that we are not indexing according to actual values in the index + attribute of the object. We are indexing according to the actual position of + the element in the object. + + Args: + indices (list-like): + An array of ints indicating which positions to take. + axis ({0 or 'index', 1 or 'columns', None}, default 0): + The axis on which to select elements. 0 means that we are selecting rows, + 1 means that we are selecting columns. For Series this parameter is + unused and defaults to 0. + **kwargs: + For compatibility with numpy.take(). Has no effect on the output. + + Returns: + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Same type as input object. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def filter( self, items=None, From 42917f7453363851eb0931c0928b43e11efa0483 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 20 Mar 2025 12:20:00 -0700 Subject: [PATCH 05/39] chore: Ignore mypy false positive (#1515) --- bigframes/core/indexes/base.py | 2 +- bigframes/dataframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 900825996e..a90e0a8705 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -493,7 +493,7 @@ def __getitem__(self, key: int) -> typing.Any: raise NotImplementedError(f"Index key not supported {key}") @overload - def to_pandas( + def to_pandas( # type: ignore[overload-overlap] self, *, allow_large_results: Optional[bool] = ..., diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 38f663f56f..4955bb1295 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1596,7 +1596,7 @@ def to_arrow( return pa_table @overload - def to_pandas( + def to_pandas( # type: ignore[overload-overlap] self, max_download_size: Optional[int] = ..., sampling_method: Optional[str] = ..., From dcfbb63c996229b0fa2b439769021837223b577b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 20 Mar 2025 13:00:34 -0700 Subject: [PATCH 06/39] chore: runs 'pip freeze' in system tests for debugging (#1504) --- noxfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/noxfile.py b/noxfile.py index 74538f49f0..77b32ab15d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -342,6 +342,9 @@ def run_system( install_systemtest_dependencies(session, install_test_extra, "-c", constraints_path) + # Print out package versions for debugging. + session.run("python", "-m", "pip", "freeze") + # Run py.test against the system tests. 
pytest_cmd = [ "py.test", From a70a607512797463f70ed529f078fcb2d40c85a1 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 20 Mar 2025 13:26:44 -0700 Subject: [PATCH 07/39] fix: change the default value for pdf extract/chunk (#1517) --- bigframes/operations/blob.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 5e786f8d22..ab529c1312 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -560,9 +560,9 @@ def pdf_extract( self, *, connection: Optional[str] = None, - max_batching_rows: int = 8192, - container_cpu: Union[float, int] = 0.33, - container_memory: str = "512Mi", + max_batching_rows: int = 1, + container_cpu: Union[float, int] = 2, + container_memory: str = "1Gi", ) -> bigframes.series.Series: """Extracts text from PDF URLs and saves the text as string. @@ -574,10 +574,10 @@ def pdf_extract( connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 8,192): Max number of rows per batch + max_batching_rows (int, default 1): Max number of rows per batch send to cloud run to execute the function. - container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. - container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. + container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. 
Returns: bigframes.series.Series: conatins all text from a pdf file @@ -604,11 +604,11 @@ def pdf_chunk( self, *, connection: Optional[str] = None, - chunk_size: int = 1000, + chunk_size: int = 2000, overlap_size: int = 200, - max_batching_rows: int = 8192, - container_cpu: Union[float, int] = 0.33, - container_memory: str = "512Mi", + max_batching_rows: int = 1, + container_cpu: Union[float, int] = 2, + container_memory: str = "1Gi", ) -> bigframes.series.Series: """Extracts and chunks text from PDF URLs and saves the text as arrays of strings. @@ -620,15 +620,15 @@ def pdf_chunk( connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - chunk_size (int, default 1000): the desired size of each text chunk + chunk_size (int, default 2000): the desired size of each text chunk (number of characters). overlap_size (int, default 200): the number of overlapping characters between consective chunks. The helps to ensure context is perserved across chunk boundaries. - max_batching_rows (int, default 8,192): Max number of rows per batch + max_batching_rows (int, default 1): Max number of rows per batch send to cloud run to execute the function. - container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. - container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. + container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. 
Returns: bigframe.series.Series: Series of array[str], where each string is a From 6681f1f9e30ed2325b85668de8a0b1d3d0e2858b Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 20 Mar 2025 13:53:28 -0700 Subject: [PATCH 08/39] fix: tolerate BQ connection service account propagation delay (#1505) --- bigframes/clients.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/bigframes/clients.py b/bigframes/clients.py index c6e1d47909..1b8212377d 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -94,16 +94,24 @@ def create_bq_connection( # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function self._ensure_iam_binding(project_id, service_account_id, iam_role) - # Introduce retries to accommodate transient errors like etag mismatch, - # which can be caused by concurrent operation on the same resource, and - # manifests with message like: - # google.api_core.exceptions.Aborted: 409 There were concurrent policy - # changes. Please retry the whole read-modify-write with exponential - # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match - # the current policy's ETag '\007\006\003,\3750&\363'. + # Introduce retries to accommodate transient errors like: + # (1) Etag mismatch, + # which can be caused by concurrent operation on the same resource, and + # manifests with message like: + # google.api_core.exceptions.Aborted: 409 There were concurrent policy + # changes. Please retry the whole read-modify-write with exponential + # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not + # match the current policy's ETag '\007\006\003,\3750&\363'. + # (2) Connection creation, + # for which sometimes it takes a bit for its service account to reflect + # across APIs (e.g. 
b/397662004, b/386838767), before which, an attempt + # to set an IAM policy for the service account may throw an error like: + # google.api_core.exceptions.InvalidArgument: 400 Service account + # bqcx-*@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not exist. @google.api_core.retry.Retry( predicate=google.api_core.retry.if_exception_type( - google.api_core.exceptions.Aborted + google.api_core.exceptions.Aborted, + google.api_core.exceptions.InvalidArgument, ), initial=10, maximum=20, From 2818ab9e19060b24bef2bab5dcc6fda333e687c6 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 20 Mar 2025 14:06:57 -0700 Subject: [PATCH 09/39] chore: Covid notebook uses partial ordering mode (#1518) --- .../bq_dataframes_covid_line_graphs.ipynb | 75 +++++++++++-------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb index c3b4c8e616..b3ae35f013 100644 --- a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb +++ b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": { "id": "9GIt_orUtNvA" }, @@ -135,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": { "id": "4aooKMmnxrWF" }, @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "metadata": { "id": "bk03Rt_HyGx-" }, @@ -206,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "metadata": { "id": "R7STCS8xB5d2" }, @@ -222,7 +222,9 @@ "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). 
For APIs where a location can't be\n", "# auto-detected, the location defaults to the \"US\" location.\n", - "bpd.options.bigquery.location = REGION" + "bpd.options.bigquery.location = REGION\n", + "# Improves performance by avoiding generating total row ordering\n", + "bpd.options.bigquery.ordering_mode = \"partial\"" ] }, { @@ -236,11 +238,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": { "id": "zDSwoBo1CU3G" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/tbergeron/src/bigframes/venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:3579: UserWarning: Reading cached table from 2025-03-20 20:22:07.633084+00:00 to avoid\n", + "incompatibilies with previous reads of this table. To read the latest\n", + "version, set `use_cache=False` or close the current session with\n", + "Session.close() or bigframes.pandas.close_session().\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + } + ], "source": [ "all_data = bpd.read_gbq(\"bigquery-public-data.covid19_open_data.covid19_open_data\")" ] @@ -256,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": { "id": "UjMT_qhjf8Fu" }, @@ -276,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": { "id": "IaoUf57ZwrJ8" }, @@ -306,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "metadata": { "id": "tYDoaKgJChiq" }, @@ -336,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": { "id": "gFbCgfFC2gHw" }, @@ -344,7 +358,7 @@ { "data": { "text/html": [ - "Query job 307ec006-490f-435d-b3e3-74eb1d73fe0f is DONE. 372.9 MB processed. Open Job" + "Query job e8946d0f-20f1-49ae-9af5-5136f45e792d is DONE. 372.9 MB processed. 
Open Job" ], "text/plain": [ "" @@ -359,13 +373,13 @@ "" ] }, - "execution_count": 9, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -419,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "metadata": { "id": "LqqHzjty8jk0" }, @@ -439,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 26, "metadata": { "id": "g4MeM8Oe9Q6X" }, @@ -479,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 27, "metadata": { "id": "x95ZgBkyDMP4" }, @@ -500,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 28, "metadata": { "id": "V0OK02D7PJSL" }, @@ -508,7 +522,7 @@ { "data": { "text/html": [ - "Query job 44159a16-cab9-4ffa-be68-2228387a48c2 is DONE. 12.6 GB processed. Open Job" + "Query job 5b76ac5f-2de7-49a6-88e8-0ba5ea3df68f is DONE. 129.5 MB processed. Open Job" ], "text/plain": [ "" @@ -545,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 29, "metadata": { "id": "EG7qM3R18bOb" }, @@ -556,13 +570,13 @@ "" ] }, - "execution_count": 14, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -586,7 +600,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "metadata": { "id": "5nVy61rEGaM4" }, @@ -597,13 +611,13 @@ "" ] }, - "execution_count": 15, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -622,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 31, "metadata": { "id": "-S1A9E3WGaYH" }, @@ -633,13 +647,13 @@ "" ] }, - "execution_count": 16, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGxCAYAAACXwjeMAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAV55JREFUeJzt3Xl8FPX9P/DXzOyRczeEkAvCFa6AAaOUCMhhpRz6VQFbRW0VFbQUtf7AVsED0X4FjyqtWupXW6i2SrUVoVZRpAUVUOS+IpAYLpMQSMhuNpvsNZ/fH5ss2VxsNptsJvt6Ph5LsjOfnX3vZNh97+eUhBACRERERBolhzsAIiIiorZgMkNERESaxmSGiIiINI3JDBEREWkakxkiIiLSNCYzREREpGlMZoiIiEjTmMwQERGRpunCHUB7U1UVRUVFiI+PhyRJ4Q6HiIiIAiCEQGVlJdLT0yHLLde9dPlkpqioCBkZGeEOg4iIiIJw6tQp9OrVq8UyXT6ZiY+PB+A9GSaTKczREBERUSCsVisyMjJ8n+Mt6fLJTF3TkslkYjJDRESkMYF0EWEHYCIiItI0JjNERESkaUxmiIiISNOYzBAREZGmMZkhIiIiTWMyQ0RERJrGZIaIiIg0jckMERERaRqTGSIiItK0Lj8DMFFzVFXgUJEV5XYnEmMMGJZugixzMVIiIq1hMkMRaVv+OazcUoCCUhtcHgG9IiEzOQ7zJmRizICkcIdHREStwGYmijjb8s9h8doDyCu2QpElRBtkKLKEvGIrFq89gG3558IdIhERtQJrZiiiqKrAyi0FOG93wu0RqLC7IAQgSYBRJ8PlUbFySwGu6N+dTU5ERBrBmhmKKIeKrDhcZIWtxg270wO3KuARAm5VwO70wFbjxuEiKw4VWcMdKhERBYg1MxRRymwOWKpd8AjvfanuHwEIAB4BWKpdKLM5whckERG1CmtmKKKUVTnhVr2ZjCShNpvx/pRqf3erAmVVzrDER0RErcdkhiKKtdrl+10I/33179cvR0REnRubmSiiSLJU16oEoHFCA3grayR2/iUi0gzWzFBEyclIgF6RIcN78ddrZfJt0ysycjISwhQhERG1FpMZiijZPc0YnBpXW/0C6BQJelmCTpF82wanxiG7pzncoRIRUYCYzFBEkWUJi6ZloUe8EYosQdSOYhICUGQJPeKNWDQti3PMEBFpCJMZijhjBiThpZsuRW6/RCTE6BFr1CEhRo/cfol46aZLuZwBEZHGhDWZWbZsGX7wgx8gPj4eycnJmD59Oo4cOeJXZuLEiZAkye/285//PEwRU1cxZkASVs8ehUevGYq54/rh0WuGYvXsUUxkiIg0KKyjmbZs2YL58+fjBz/4AdxuNxYvXozJkyfj8OHDiI2N9ZWbO3cunnrqKd/9mJiYcIRLXUhTC02+v+c0F5okItKgsCYzGzZs8Lu/evVqJCcnY9euXRg/frxve0xMDFJTUzs6POqi6haatDnc6BZjgEGR4fSoyCuuxOK1B/DMjGwmNEREGtKp+sxYLBYAQGJiot/2v/3tb0hKSsIll1yCRYsWwW63hyM86gLqFpq0OdxINUUhSq9AliVE6RWkmoywOTxYuaUAqtrEBDRERNQpdZpJ81RVxYMPPoixY8fikksu8W2/9dZb0adPH6Snp2P//v14+OGH
ceTIEbz//vtNHsfhcMDhuLCujtXKBQPpgkNFVhSU2tAtxgBJ8h+xJEkSEmL0KCi14VCRFdm9ODybiEgLOk0yM3/+fBw8eBBffvml3/Z77rnH93t2djbS0tJw9dVXo6CgAJmZmY2Os2zZMixdurTd4yVtKrc74fIIGJSmKyWNigyLKlBu59pMRERa0Smame677z58+OGH+O9//4tevXq1WDY3NxcAkJ+f3+T+RYsWwWKx+G6nTp0KebykXYkxBugVCU6P2uR+h0eFXpaQGGPo4MiIiChYYa2ZEULg/vvvx9q1a7F582b069fvoo/Zu3cvACAtLa3J/UajEUajMZRhUhcyLN2EzOQ45BVXItUk+zU1CSFQYXchKy0ew9JNYYySiIhaI6w1M/Pnz8df//pXvP3224iPj0dJSQlKSkpQXV0NACgoKMDTTz+NXbt24fjx41i/fj1uv/12jB8/HsOHDw9n6KRRsixh3oRMxBkVlFgdqHZ5oKoC1S4PSqwOxBkVzJuQyRmAiYg0RBKiqXWDO+jJpaY/MFatWoXZs2fj1KlT+OlPf4qDBw+iqqoKGRkZmDFjBh577DGYTIF9c7ZarTCbzbBYLAE/hro+v3lmVAG9LCEzOY7zzBARdRKt+fwOazLTEZjMUHNUVeBQkRXldicSYwwYlm5ijQwRUSfRms/vTjOaiaijybLE4ddERF1ApxjNRERERBQsJjNERESkaUxmiIiISNOYzBAREZGmMZkhIiIiTWMyQ0RERJrGZIaIiIg0jckMERERaRqTGSIiItI0JjNERESkaUxmiIiISNOYzBAREZGmMZkhIiIiTWMyQ0RERJqmC3cAROGiqgKHiqwotzuRGGPAsHQTZFkKd1hERNRKTGYoIm3LP4eVWwpQUGqDyyOgVyRkJsdh3oRMjBmQFO7wiIioFdjMRBFnW/45LF57AHnFVsQadUiONyLWqENecSUWrz2Abfnnwh0iERG1ApMZiiiqKrBySwFsDjdSTVGI0iuQZQlRegWpJiNsDg9WbimAqopwh0pERAFiMkMR5VCRFQWlNnSLMUCS/PvHSJKEhBg9CkptOFRkDVOERETUWkxmKKKU251weQQMStOXvlGR4VIFyu3ODo6MiIiCxWSGIkpijAF6RYLToza53+FRoZclJMYYOjgyIiIKFpMZiijD0k3ITI7DebsLQvj3ixFCoMLuQmZyHIalm8IUIRERtRaTGYoosixh3oRMxBkVlFgdqHZ5oKoC1S4PSqwOxBkVzJuQyflmiIg0hMkMRZwxA5LwzIxsZKXFw+5wo9TmgN3hRlZaPJ6Zkc15ZoiINIaT5lFEGjMgCVf0784ZgImIugAmMxSxZFlCdi9zuMMgIqI2YjMTERERaRqTGSIiItI0JjNERESkaUxmiIiISNOYzBAREZGmMZkhIiIiTWMyQ0RERJrGZIaIiIg0jZPmUcRSVcEZgImIugAmMxSRtuWfw8otBSgotcHlEdArEjKT4zBvQibXZiIi0hg2M1HE2ZZ/DovXHkBesRWxRh2S442INeqQV1yJxWsPYFv+uXCHSERErcBkhiKKqgqs3FIAm8ONVFMUovQKZFlClF5BqskIm8ODlVsKoKoi3KESEVGAmMxQRDlUZEVBqQ3dYgyQJP/+MZIkISFGj4JSGw4VWcMUIRERtRaTGYoo5XYnXB4Bg9L0pW9UZLhUgXK7s4MjIyKiYDGZoYiSGGOAXpHg9KhN7nd4VOhlCYkxhg6OjIiIgsVkhiLKsHQTMpPjcN7ughD+/WKEEKiwu5CZHIdh6aYwRUhERK3FZIYiiixLmDchE3FGBSVWB6pdHqiqQLXLgxKrA3FGBfMmZHK+GSIiDWEyQxFnzIAkPDMjG1lp8bA73Ci1OWB3uJGVFo9nZmRznhkiIo3hpHkUkcYMSMIV/btzBmAioi6AyQxFLFmWkN3LHO4wiIiojdjMRERERJrGZIaIiIg0jc1MFLG4ajYRUdfAZIYiElfNJiLq
OtjMRBGHq2YTEXUtTGYoonDVbCKirofJDEWU+qtmQwKqnR5U1rhQ7fQAErhqNhGRBrHPDEWUulWznR4VxZYaONweCAFIEmDUKegeZ+Cq2UREGhPWmplly5bhBz/4AeLj45GcnIzp06fjyJEjfmVqamowf/58dO/eHXFxcbjxxhtx5syZMEVMWpcYY4AqBL4/X40alweyJEGnSJAlCTUuD74/Xw1VFVw1m4hIQ8KazGzZsgXz58/HV199hY0bN8LlcmHy5Mmoqqrylfl//+//4V//+hfee+89bNmyBUVFRZg5c2YYoyYty0qNh0cIeFQBRQZkSYIEbzKjyIBHFfAIgazU+HCHSkREAQprM9OGDRv87q9evRrJycnYtWsXxo8fD4vFgj/96U94++238cMf/hAAsGrVKmRlZeGrr77CFVdcEY6wScPySiqhSIBOkeBWAZ0sIAEQgPe+IkGRvOW41AERkTZ0qg7AFosFAJCYmAgA2LVrF1wuFyZNmuQrM2TIEPTu3Rvbt29v8hgOhwNWq9XvRlSn3O6ELMlIN0cjWi9DFQJuVUAVAtF673ZZltlnhohIQzpNB2BVVfHggw9i7NixuOSSSwAAJSUlMBgMSEhI8CubkpKCkpKSJo+zbNkyLF26tL3DJY1KjDFAr0gw6GT07R6LGpcKt6pCJ8uI0suocavQe1T2mSEi0pBOUzMzf/58HDx4EGvWrGnTcRYtWgSLxeK7nTp1KkQRUlcwLN2EzOQ4nLe7AADRBgXxUXpEGxQAQIXdhczkOAxLN4UzTCIiaoVOkczcd999+PDDD/Hf//4XvXr18m1PTU2F0+lERUWFX/kzZ84gNTW1yWMZjUaYTCa/G1EdWZYwb0Im4owKSqwOVLs8UFWBapcHJVYH4owK5k3I5BpNREQaEtZkRgiB++67D2vXrsV//vMf9OvXz2//5ZdfDr1ej02bNvm2HTlyBCdPnsTo0aM7OlzqIsYMSMIzM7KRlRYPu8ONUpsDdocbWWnxeGZGNtdmIiLSmLD2mZk/fz7efvttrFu3DvHx8b5+MGazGdHR0TCbzbj77ruxYMECJCYmwmQy4f7778fo0aM5konaZMyAJFzRvztXzSYi6gIkIUTYFqGRpKY/OFatWoXZs2cD8E6at3DhQrzzzjtwOByYMmUK/vCHPzTbzNSQ1WqF2WyGxWJhkxMREZFGtObzO6zJTEdgMkNERKQ9rfn87hQdgImIiIiCxWSGiIiINI3JDBEREWkakxkiIiLSNCYzREREpGlMZoiIiEjTmMwQERGRpjGZISIiIk1jMkNERESaxmSGiIiINI3JDBEREWla0MlMRUUF3njjDSxatAjl5eUAgN27d+P7778PWXBEREREF6ML5kH79+/HpEmTYDabcfz4ccydOxeJiYl4//33cfLkSbz55puhjpOIiIioSUHVzCxYsACzZ8/GsWPHEBUV5dt+zTXX4PPPPw9ZcEREREQXE1Qy88033+Dee+9ttL1nz54oKSlpc1BEREREgQoqmTEajbBarY22Hz16FD169GhzUERERESBCiqZuf766/HUU0/B5XIBACRJwsmTJ/Hwww/jxhtvDGmARERERC0JKpn57W9/C5vNhuTkZFRXV2PChAkYMGAA4uPj8b//+7+hjpGIiIioWUGNZjKbzdi4cSO2bt2Kffv2wWaz4bLLLsOkSZNCHR8RERFRi4JKZuqMHTsWY8eOBeCdd4aIiIioowXVzPTss8/i73//u+/+TTfdhO7du6Nnz57Yt29fyIIjIiIiupigkpk//vGPyMjIAABs3LgRGzduxMcff4xp06bhV7/6VUgDJCIiImpJUM1MJSUlvmTmww8/xE033YTJkyejb9++yM3NDWmARERERC0JqmamW7duOHXqFABgw4YNvo6/Qgh4PJ7QRUdERER0EUHVzMycORO33norBg4ciLKyMkybNg0AsGfPHgwYMCCkARIRERG1JKhk5qWXXkLfvn1x6tQp
PPfcc4iLiwMAFBcX4xe/+EVIAyQiIiJqiSSEEOEOoj1ZrVaYzWZYLBaYTKZwh0NEREQBaM3nd8A1M+vXr8e0adOg1+uxfv36Fstef/31gR6WiIiIqE0CrpmRZRklJSVITk6GLDffb1iSpE7VCZg1M0RERNrTLjUzqqo2+TsRERFROAU1NJuIiIioswhqNNNTTz3V4v4nnngiqGCIiIiIWiuoZGbt2rV+910uFwoLC6HT6ZCZmclkhoiIiDpMUMnMnj17Gm2zWq2YPXs2ZsyY0eagiIiIiAIVsj4zJpMJS5cuxeOPPx6qQxIRERFdVEg7AFssFlgsllAekoiIiKhFQTUz/f73v/e7L4RAcXEx3nrrLd86TUREREQdIei1meqTZRk9evTAHXfcgUWLFoUkMCIiIqJABJXMFBYWhjoOIiIioqC0us+My+WCTqfDwYMH2yMeIiIiolZpdTKj1+vRu3fvTrX+EhEREUWuoEYzPfroo1i8eDHKy8tDHQ8RERFRqwTVZ+aVV15Bfn4+0tPT0adPH8TGxvrt3717d0iCIyIiIrqYoJKZ6dOnhzgMIiIiouBIQggR7iDak9VqhdlshsVigclkCnc4REREFIDWfH4HVTNTZ+fOncjLywMADB06FJdffnlbDkdERETUakElM6dPn8Ytt9yCrVu3IiEhAQBQUVGBMWPGYM2aNejVq1coYyQiIiJqVlCjmebMmQOXy4W8vDyUl5ejvLwceXl5UFUVc+bMCXWMRERERM0Kqs9MdHQ0tm3bhpycHL/tu3btwrhx42C320MWYFuxzwwREZH2tObzO6iamYyMDLhcrkbbPR4P0tPTgzkkERERUVCCSmaef/553H///di5c6dv286dO/HLX/4SL7zwQsiCIyIiIrqYgJuZunXrBkmSfPerqqrgdruh03n7ENf9Hhsb26lmBmYzExERkfa0y9DsFStWtDUuIiIiopALOJm54447Wn3w5cuX4+c//7lv+DYRERFRqAXVZyZQzzzzTItNTp9//jmuu+46pKenQ5IkfPDBB377Z8+eDUmS/G5Tp05tz5CJiIhIY9o1mblYd5yqqiqMGDECr776arNlpk6diuLiYt/tnXfeCXWYREREpGFtWs6graZNm4Zp06a1WMZoNCI1NbWDIiIiIiKtadeamVDYvHkzkpOTMXjwYMybNw9lZWXhDomIiIg6kbDWzFzM1KlTMXPmTPTr1w8FBQVYvHgxpk2bhu3bt0NRlCYf43A44HA4fPetVmtHhUtERERh0KmTmVmzZvl+z87OxvDhw5GZmYnNmzfj6quvbvIxy5Ytw9KlSzsqRCIiIgqzdm1mGjduHKKjo0N2vP79+yMpKQn5+fnNllm0aBEsFovvdurUqZA9PxEREXU+QdXM7N69G3q9HtnZ2QCAdevWYdWqVRg6dCiefPJJGAwGAMBHH30UukgBnD59GmVlZUhLS2u2jNFohNFoDOnzEhERUecVVM3Mvffei6NHjwIAvvvuO8yaNQsxMTF477338Otf/zrg49hsNuzduxd79+4FABQWFmLv3r04efIkbDYbfvWrX+Grr77C8ePHsWnTJtxwww0YMGAApkyZEkzYRERE1AUFlcwcPXoUl156KQDgvffew/jx4/H2229j9erV+Oc//xnwcXbu3ImcnBzk5OQAABYsWICcnBw88cQTUBQF+/fvx/XXX49Bgwbh7rvvxuWXX44vvviCNS9ERETkE1QzkxACqqoCAD777DP8z//8DwAgIyMD586dC/g4EydObHFivU8++SSY8IiIiCiCBFUzM3LkSPzmN7/BW2+9hS1btuDaa68F4G0mSklJCWmARERERC0JKplZsWIFdu/ejfvuuw+PPvooBgwYAAD4xz/+gTFjxoQ0QCIiIqKWSOJiCyi1Qk1NDRRFgV6vD9Uh28xqtcJsNsNiscBkMoU7HCIiIgpAaz6/QzppXlRUVCgPR0RERHRRAScz3bp1gyRJAZUtLy8P
OiAiIiKi1gg4mVmxYoXv97KyMvzmN7/BlClTMHr0aADA9u3b8cknn+Dxxx8PeZBEREREzQmqz8yNN96Iq666Cvfdd5/f9ldeeQWfffYZPvjgg1DF12bsM0NERKQ9rfn8Dmo00yeffIKpU6c22j516lR89tlnwRySiIiIKChBJTPdu3fHunXrGm1ft24dunfv3uagiIiIiAIV1GimpUuXYs6cOdi8eTNyc3MBAF9//TU2bNiA119/PaQBEhEREbUkqGRm9uzZyMrKwu9//3u8//77AICsrCx8+eWXvuSGiIiIqCOEdNK8zogdgImIiLSnQybNU1UV+fn5KC0t9S06WWf8+PHBHpaow6iqwKEiK8rtTiTGGDAs3QRZDmwuJSIi6jyCSma++uor3HrrrThx4kSjVa8lSYLH4wlJcETtZVv+OazcUoCCUhtcHgG9IiEzOQ7zJmRizICkcIdHREStENRopp///OcYOXIkDh48iPLycpw/f9534+y/1Nltyz+HxWsPIK/YilijDsnxRsQadcgrrsTitQewLf9cuEMkIqJWCKpm5tixY/jHP/7hWy2bSCtUVWDllgLYHG6kmqJ8S3REyQpSTTJKrA6s3FKAK/p3Z5MTEZFGBFUzk5ubi/z8/FDHQtTuDhVZUVBqQ7cYQ6O1xiRJQkKMHgWlNhwqsoYpQiIiaq2gambuv/9+LFy4ECUlJcjOzoZer/fbP3z48JAERxRq5XYnXB4Bg9J0Hm9UZFhUgXK7s4MjIyKiYAWVzNx4440AgLvuusu3TZIkCCHYAZg6tcQYA/SKBKdHRZSsNNrv8KjQyxISYwxhiI6IiIIRVDJTWFgY6jiIOsSwdBMyk+OQV1yJFJMEh0vArarQyTKMegkVdhey0uIxLJ1zEhERaUVQyUyfPn1CHQdRh5BlCfMmZOL/vbsXR8/YoKoCQgCS5N3XPdaAeRMy2fmXiEhDguoADABvvfUWxo4di/T0dJw4cQIAsGLFiiYXoCTqbJxuFW6PgEcAKgCPANweAYdbvehjiYiocwkqmVm5ciUWLFiAa665BhUVFb4+MgkJCVixYkUo4yMKKVUVWPZxHizVLkgA9Irku0kALNUuLPs4D6rapVf5ICLqUoJKZl5++WW8/vrrePTRR6EoFzpRjhw5EgcOHAhZcEShduB7C46esUECYNDL0MkXbga9DAnA0TM2HPjeEu5QiYgoQEElM4WFhcjJyWm03Wg0oqqqqs1BEbWXvScr4PKoUGr7xKhCwKMKqLXLciiyBJdHxd6TFWGMkoiIWiOoZKZfv37Yu3dvo+0bNmxAVlZWW2Miajeitl+vKrz9ZpxuFU6P6vu9rnVJsP8vEZFmBDWaacGCBZg/fz5qamoghMCOHTvwzjvvYNmyZXjjjTdCHSNRyORkJECRJLhqsxYJ3pFMEN4ERxUCellCTkZCOMMkIqJWCCqZmTNnDqKjo/HYY4/Bbrfj1ltvRXp6On73u99h1qxZoY6RKGSGpZlg0MtwObyd1oXvnwsMehnD0jjPDBGRVrQ6mXG73Xj77bcxZcoU3HbbbbDb7bDZbEhOTm6P+IhCKq+kElE6GdVOD5oasCRLQJRORl5JJbJ7mTs+QCIiarVW95nR6XT4+c9/jpqaGgBATEwMExnSjHK7E27V27zUFAmAWwXXZiIi0pCgOgCPGjUKe/bsCXUsRO0uIVqPGpe3icmok6CTvbUxOtl7HwBqXB4kROtbOgwREXUiQfWZ+cUvfoGFCxfi9OnTuPzyyxEbG+u3n6tmU2cmajv7OtwX2plUAbhVAal2PxERaUdQyUxdJ98HHnjAt42rZpMWVFS7IMuAaOYSFQBk2VuOiIi0gatmU0QxG3UXXX/J4VZhNgb1X4OIiMIgqHfsEydOYMyYMdDp/B/udruxbds2rqpNnVbBuaqLNiMJ4S13aZ9uHRMUERG1SVAdgK+66iqUl5c32m6xWHDVVVe1OSii9lJsqb5oGSnAckRE1DkElczU
9Y1pqKysrFFnYKLOpGdCTLPDshuWIyIibWhVM9PMmTMBeDv7zp49G0aj0bfP4/Fg//79GDNmTGgjJAqhKUOSG07424ioLUdERNrQqmTGbPbOiCqEQHx8PKKjo337DAYDrrjiCsydOze0ERKF0P9tC6zz+v9tK8SDkwa1czRERBQKrUpmVq1aBQDo27cvHnrooYs2KW3duhUjR470q8EhCqcDpywhLUdEROEXVJ+ZJUuWBNQ3Ztq0afj++++DeQqidmGKDix/D7QcERGFX1DJTKAEp1KlTuaKzO4hLUdEROHXrskMUWfTIy6wJs9AyxERUfgxmaGIct4e2DIFgZYjIqLwYzJDEcUa4JpLgZYjIqLwa9dkpqmJ9YjCKdArklcuEZF2sAMwRZQYoxLSckREFH7tOv60srKyPQ9P1GpHzwR2TQZajoiIwi+ompkzZ87gZz/7GdLT06HT6aAoit+NqLM6XVET0nJERBR+QdXMzJ49GydPnsTjjz+OtLQ09o0hzYhWAsvfAy1HREThF1Qy8+WXX+KLL77ApZdeGuJwiNrXJRlmrN1XFFA5IiLShqC+fmZkZLBzL2nSyD6JkC9SkShL3nJERKQNQSUzK1aswCOPPILjx4+HOByi9pXd04w+3WNaLNOnewyye7JmhohIKwJuZurWrZtf35iqqipkZmYiJiYGer3er2x5eXnoIiQKsTijDhKApuoWpdr9RESkHQG/a69YsSLkT/7555/j+eefx65du1BcXIy1a9di+vTpvv1CCCxZsgSvv/46KioqMHbsWKxcuRIDBw4MeSwUGQ4VWVFUUdNiMlNUUYNDRVZk92LtDBGRFgSczNxxxx0hf/KqqiqMGDECd911F2bOnNlo/3PPPYff//73+Mtf/oJ+/frh8ccfx5QpU3D48GFERUWFPB7q+spsDliqXVCb2a8CsFS7UGZzdGRYRETUBkHVp3/00UdQFAVTpkzx2/7pp5/C4/Fg2rRpAR1n2rRpzZYVQmDFihV47LHHcMMNNwAA3nzzTaSkpOCDDz7ArFmzggmdIlxZlRNuteXO625VoKzK2UERERFRWwXVAfiRRx6Bx+NptF1VVTzyyCNtDgoACgsLUVJSgkmTJvm2mc1m5ObmYvv27SF5Doo8lurAkpRAyxERUfgFVTNz7NgxDB06tNH2IUOGID8/v81BAUBJSQkAICUlxW97SkqKb19THA4HHI4LTQRWqzUk8VDXcMYSWPNRoOWIiCj8gqqZMZvN+O677xptz8/PR2xsbJuDaotly5bBbDb7bhkZGWGNhzqXlITA+loFWo6IiMIvqGTmhhtuwIMPPoiCggLftvz8fCxcuBDXX399SAJLTU0F4F0Hqr4zZ8749jVl0aJFsFgsvtupU6dCEg91DTkZCSEtR0RE4RdUMvPcc88hNjYWQ4YMQb9+/dCvXz9kZWWhe/fueOGFF0ISWL9+/ZCamopNmzb5tlmtVnz99dcYPXp0s48zGo0wmUx+N6I6siRBCWApsQPfW9o/GCIiComg+syYzWZs27YNGzduxL59+xAdHY3hw4dj/PjxrTqOzWbz62NTWFiIvXv3IjExEb1798aDDz6I3/zmNxg4cKBvaHZ6errfXDRErXG+yolAVuJ4ZdMxDOgRhzEDkto/KCIiapNWJzMulwvR0dHYu3cvJk+ejMmTJwf95Dt37sRVV13lu79gwQIA3jltVq9ejV//+teoqqrCPffcg4qKClx55ZXYsGED55ihoJ2zOZqdY6Y+u9ODlVsKcEX/7pAvtpgTERGFVauTGb1ej969ezc5NLu1Jk6c2OKClZIk4amnnsJTTz3V5uciAoBvSwIb3SbLEgpKbZwJmIhIA4LqM/Poo49i8eLFXIOJNOd0RU1A5QQAlypQbud8M0REnV1QfWZeeeUV5OfnIz09HX369Gk0HHv37t0hCY4o1KKVwPJ3VVWhl3VIjDG0c0RERNRWQSUz7IBLWnVJTxPW7iu6aDkBIDM5
DsPSORqOiKizCyqZWbJkSajjIOoQpgBrWmIMOsybkMnOv0REGhBUnxkiraoMsA/MtKEpHJZNRKQRQdXMeDwevPTSS3j33Xdx8uRJOJ3+HxDsGEyd1f6iwEYzVTrbPlqPiIg6RlA1M0uXLsWLL76Im2++GRaLBQsWLMDMmTMhyzKefPLJEIdIFDoOV2BJSqDliIgo/IJKZv72t7/h9ddfx8KFC6HT6XDLLbfgjTfewBNPPIGvvvoq1DEShUyvhOiQliMiovALKpkpKSlBdnY2ACAuLg4Wi3cdm//5n//Bv//979BFRxRig9ICG50UaDkiIgq/oJKZXr16obi4GACQmZmJTz/9FADwzTffwGg0hi46ohBLijfiYgOUZMlbjoiItCGoZGbGjBm+1azvv/9+PP744xg4cCBuv/123HXXXSENkCiUkmKNiDO23O89zqhDUiyTGSIirQhqNNPy5ct9v998883o3bs3tm/fjoEDB+K6664LWXBEoZaVGg/1Istmq0IgKzW+gyIiIqK2CiqZaWj06NEYPXp0KA5F1K4OFVvhcLW8brbDpeJQsRUjMhI6JigiImqToCfNe+uttzB27Fikp6fjxIkTAIAVK1Zg3bp1IQuOKNT2nqyA+yI1M24hsPdkRccEREREbRZUMrNy5UosWLAA11xzDSoqKuDxeOfkSEhIwIoVK0IZH1FICSFwkVwGQnjLERGRNgSVzLz88st4/fXX8eijj0JRFN/2kSNH4sCBAyELjijUYqMCa1kNtBwREYVfUMlMYWEhcnJyGm03Go2oqqpqc1BE7SXQtZn2nTrfzpEQEVGoBJXM9OvXD3v37m20fcOGDcjKymprTETtJtC1mT45dAaqyqYmIiItCKoufcGCBZg/fz5qamoghMCOHTvwzjvvYNmyZXjjjTdCHSNRyFQHuOaStdqJQ0VWZPcyt3NERETUVkElM3PmzEF0dDQee+wx2O123HrrrejZsyd+97vfYdasWaGOkShkonSBVUYKAOUBNkkREVF4BZXMVFdXY8aMGbjttttgt9tx8OBBbN26Fb169Qp1fEQhlRwX2My+OllGYoyhnaMhIqJQCKrPzA033IA333wTAOB0OnH99dfjxRdfxPTp07Fy5cqQBkgUSqkJUQGVSzEZMSydi00SEWlBUMnM7t27MW7cOADAP/7xD6SkpODEiRN488038fvf/z6kARKFkrXGHVC5nN7dIF9sRUoiIuoUgkpm7HY74uO9a9d8+umnmDlzJmRZxhVXXOGbDZioMzpb6QioXJReuXghIiLqFIJKZgYMGIAPPvgAp06dwieffILJkycDAEpLS2EysWqeOi+7M7DRTIGWIyKi8AsqmXniiSfw0EMPoW/fvsjNzfUtMvnpp582OZkeUWfRPVYf0nJERBR+QY1m+vGPf4wrr7wSxcXFGDFihG/71VdfjRkzZoQsOKJQC7QXDHvLEBFpR9AL0KSmpiI1NdVv26hRo9ocEFF7CrTPTKDliIgo/IJqZiLSqsIye0jLERFR+DGZoYii1wXWgBRoOSIiCj8mMxRRxmYmhbQcERGFH5MZiihX9EsMaTkiIgo/JjMUUV7/4nhIyxERUfgxmaGIEuhK2Fwxm4hIO5jMUETpERvYStiBliMiovBjMkMRpcblCmk5IiIKPyYzFFGOn68JaTkiIgo/JjMUUYxKYJd8oOWIiCj8+I5NESVOH9glH2g5IiIKP75jU0Q5Xx1YX5hAyxERUfgxmaGIYqlxh7QcERGFH5MZiihCVUNajoiIwo/JDEUUnRLYApKBliMiovBjMkMRxVIdWI1LoOWIiCj8mMxQRAk0RWEqQ0SkHUxmiIiISNOYzFBEESEuR0RE4cdkhoiIiDSNyQwRERFpGpMZIiIi0jQmM0RERKRpTGaIiIhI05jMEDVDVTmmiYhIC5jMEDXjwPeWcIdAREQBYDJD1Iy9JyvCHQIREQWg0yczTz75JCRJ8rsNGTIk3GFRBBBca5KI
SBN04Q4gEMOGDcNnn33mu6/TaSJs0ricjIRwh0BERAHQRFag0+mQmpoa7jAowmT3NIc7BCIiCkCnb2YCgGPHjiE9PR39+/fHbbfdhpMnTzZb1uFwwGq1+t2IgiHLbGciItKCTp/M5ObmYvXq1diwYQNWrlyJwsJCjBs3DpWVlU2WX7ZsGcxms++WkZHRwRETERFRR5KEEJqaTKOiogJ9+vTBiy++iLvvvrvRfofDAYfD4btvtVqRkZEBi8UCk8nUkaFSJ9T3kX8HXPb48mvbMRIiImqJ1WqF2WwO6PNbE31m6ktISMCgQYOQn5/f5H6j0Qij0djBUREREVG4dPpmpoZsNhsKCgqQlpYW7lCIiIioE+j0ycxDDz2ELVu24Pjx49i2bRtmzJgBRVFwyy23hDs06uK4nAERkTZ0+mam06dP45ZbbkFZWRl69OiBK6+8El999RV69OgR7tCoiztUZEV2Lw7PJiLq7Dp9MrNmzZpwh0AR6mxlDQAmM0REnV2nb2YiCpf9XGiSiEgTmMwQNaPUWhPuEIiIKACdvpmJKFz2n6rwu6+qAoeKrCi3O5EYY8CwdBNnCSYi6gSYzBA142BxJT4/Worxg5KxLf8cVm4pQEGpDS6PgF6RkJkch3kTMjFmQFK4QyUiimhsZiJqwS/e3oP/25KPxWsPIK/YilijDsnxRsQadcgrrsTitQewLf9cuMMkIopoTGaIWmCrceOFjUdx3u5EqikKUXoFsiwhSq8g1WSEzeHByi0FnJOGiCiMmMwQXYTTLeB0C6Bh9xgJiNbLOFxkxbq9RUxoiIjChMkMUQBcHg9qnKrvvs3hxvFzdhRbqnHe7sTTHx7CHat2sMmJiCgMmMwQBUAVgFv1JjM2hxvfn69GjcsDSZKgSBL70BARhRGTGaIACAEokgQBgbOVDqhCQJG9SY5RL8McrWcfGiKiMGEyQxQAnSKhrMqJEksNqp1uSBLgVr0JTo/4KEiSBEmSkBCjR0GpDYeKrOEOmYgoYnCeGaIAGBQJVU433B4BAUD1CBj1MtLM0YgzXvhvZFRkWFSBcrszfMESEUUYJjNEAaiq7fyrk4HarjPweAQcbg+EENDJMqIMMhweFXpZQmKMIYzREhFFFiYzRK3grk1kJAAuVaC4ogaKLEGSvE1OiixhYEo8slLjwxonEVEkYZ8ZoiCIej8FBFwegRq3iiqnB9+drcSdf/mGo5qIiDoIkxmiNvLUq62RJcDtAQ4XWTlMm4iogzCZIWojGYBBJ8Gok6FXZLhUFQnReg7T7mRUVeDAaQu2HD2LA6ct/LsQdSHsM0PURgKAItV+LxACQgAeIfyGaWf3Moc1xkjHVc+JujbWzBC1kQDg8qhQhYC79tu+IkswKjJcHoHdJ86zNiCMtuWf46rnRF0ca2aCpKoCh4qsKLc7kRhjwLB0E2S54UqEFCnc6oVERgLw/flqyLIEh1vFS58dgV5R2q02oP61mBCtBwBUVLt4XcJ7blZuKYDN4UaqyTu5IQBEyQpSTTJKrA6s3FKAK/p3j+jzRKR1TGaCwCpraokAUOO+sChlRbUbUToPusdF+WoDnpmRHZJrpf61WOXwoNrlgSQBUXoFsQYl4q/LQ0VWFJTa0C3G4Etk6jScsZlNgUTaxWamVmKVNQWjxi1wxloDvQKcr3LiuU+OwF0v4QlG/WtRkoBqlxseVYXbo8LucEOSpIi/LsvtTrg8Agal6bc6oyLDxRmbiTSPyUwrNKyyjtIrkGUJUXqFiwzSRblVgWKLA5U1bhw4XYEfv7Y96CSj/rWYEm+EpdoFjwD0igy9ToYAYKl2IcVkiOjrMjHGAL0iwelpOnHkjM1EXQOTmVZoTZU1UXNUAB4B5BVbsfC9ffjy2NlWH6P+tehwCzjcKnSyd7FLCd6ZiB1uDxwuEdHX5bB0EzKT43De7oIQ/smcEAIVdhcyk+MwLN0UpgiJKBSYzLQCq6y1rbPVTDjcKootNbj3rV14c/vxVsVX
/1p0qyqE8HY8FhBQhYAQAqoA3KoKgyKh2uXBlqOlvhFVbreKtbu/xyv/OYa1u79vc5NXZyXLEuZNyEScUUGJ1YFqlweqKlDt8qDE6kCcUcG8CZns/EukcewA3Ar1q6yjZKXRflZZd26dtWaiyunBknWHsHrrcVw5MAmZSXG4tHcCsnuam/2QrX8t6mQZkgS4hYDq8c5zU5cWFVtqAAi4PQJvfFGIt78+CaNextlKB6qdHqjwfqNZ+uEhzJ+YibnjM5t8Pi2P3hszIAnPzMj2dZS2qAJ6WUJWWnxEd44m6kqYzLRCXZV1XnElUk2yX1NTXZV1Vlo8q6w7qc5cYyYAfHeuCt+dqwIA6BUJvRNjsPT6YbhyYA8AjYdg9+8Rh29LKpESb4AiS6hxeWtX6q5KCd7aHwAw6mSkm6NQWlmDIksNAO8K4AZZgkcVsNhdWL7hCFQB3DvBP6HZln8Of9hcgCMllXB6VBgUGYNT4/GLidpJBMYMSMIV/btrNiEjopYxmWmFuirrxWsPoMTqQEKMHkZFhsOjosLuYpV1J6elGjOXR6DgbBXuWv0NbruiD5Ljo/DJoRKUWmt80wF0jzNAkYEzlQ6o9VqJRIOfACBJACSB83aXb5tHwK8mx6MKPP/JEQxOjUP32CiU2504VW7Hy/85hvIqJ0Tt7MaSBHxd6MSx0kq8dNOlmkloZFni8GuiLkoSDXvFdTFWqxVmsxkWiwUmU2hqTPzmmamtso70+Ty0QFUF+i/+KNxhBE2Cd/6YZJMRBkXGebsLigzEGXU4fq4KAkBdtxtJgq8fjU6WAAnoFm1Aqc3hO5aod1zUlgcAnQKYo/SQIMFS7YJLFZAB6HWy77hujwoBb23luvlXMoEnopBrzec3a2aCwCprCgcBoNrlwenz1UgxGZFiMuCM1YkonYxogw4uj3eOmbqkRABQZEBRJLg9wm94csNEpj63B6h2qkiON6CsyukrLwDIkCBJ3sTG5VZx9IwNB763YERGQnu+dCKiFjGZCRKrrLWlrjatK/CoAkUVNThncyJaL+NEeTXsTre3Fkbx1p54VG9nYLcKSJKAJKHZUXgNKTLgUlXYHB6/Jiu3R4WskyHhwvBvl0fF3pMVTGaIKKyYzFCX0tSom6++K8PitQdgc7jDHV5IOd0qnPWGVEsAPKoKRZYhS4AseZudXB6BOKMO0YbGNYcCgCT8+9co8D5OrdcCLcHbvFTXZ8a3EYBghSQRhRmTGdK0+snLqXI7NhwswXdnL6yZ1b9HLCzVLt+szedsnXdEU1sJAG7VO7cM4E1m6igyUFTh8OsrU/9x9ctJsgxJCETrFUhw+ZqYgNp+NZJ39J7HI6CTZeSwVoaIwozJDGlWw0UWbU43ZAlIjo9CcrwBTo+Kg99bUVmbyESa+nPwWavdjZKYhmQJUCQJblUgWi8jIUaPMrvTN+Tb68JkfAJAssmIz4+WIr/UhsykWFgcbvYhI6IOx2QmSDW1qxPLkgRZkiDBW/3ecJkDah91iyzaHG4kROthqXYBtbPenq10wKCTEWfUwRyth6XGhQq7Ew63J9xhh00gQxb1igynW4UiSzBF6SHJEtLM0ThVXuXte4PaBKl2iLYQQNH5avx24zEA3us/SqcgxqAgIzEGD00e5JsjB2h+4j23W8W/9hfj+wo70szRQSVFF5vUr25/mc2B83YXEmL1SIo1MukiP8FMDqnlCSW7EiYzQRq+9FO//gp16hIcCbU/fQmPN9FpeN/vJ+rdly/c9z/Ohd8vduy6GGTZ+xO4sL/u8VK9+7LkHcLbsExLj/EmcVK9560r18z9utckN3Xsupjr9jd4TO3zA8DrX3yHczYHEqINqKxx1yaX3v1uVUVxRTWSTUa43AISvKOAqpyRm8wEwq2K2qYqgdLKGliqXd65lPQKJJcHRr2utuZG9Z1LWYZ3sSl4k5tqlwfVLg/O252Y8+ZOLPzRIMwdn+k/nYGvCTAOsUYFXx47511moN6w8hiD
AnOUPqApD5o6dv3H1e0/XGSFtcYFVRWQZQmmKB2Gpptx7/j+MEcb/D6MALT5Qy0rNR55JZW+SQ5VIbDvtAWSwEVneG4vdTGeq3KgosqFbjF6dI8LPqlrOJEjAFRUu1r9od5cQtDa7W1xsesoVI/pSjpTIsd5ZoI06NGPm12Jl0irFBlQ1QYT7uFCUmzUybA7vYlHU/1vGh1PAi7paUbBWRvcHgGDToYiS3C61Ysml9F6GYAEg07Cj7JS0Kd7bL1k2pvgniiz4+ODxXC6VcQYFCiyDI+qwu5SEaWTMKpfd+woLIfd6YbD7R22LkvwLeNQ98Fo1CnQKVJtrZT3O15ljRse1XtO0hOicf2IdFzSy+xLrgHv81c6XCi1OvDVd2X4vqIabk/t2ljwPpdHBWpcbtR995HgneG5X1Is7rtqAEb2S/T/koJ6X1zkpr8cNfziEogLSZ0F1hp3vaROj6HpplZ/ANf/ILfWuOBweavvjDoFRp2MjMRo3HhZL/TuHtviB11zCcH4gUnYcvSc38zTg1LiMCg1HjsKy1Fq9c6ZVFe+qaQ00A/W+jW93WIMMCgynB4V52snQ31mRnajcxPMY7qSjkjkWvP5zWQmSAMWfwR3J1u4kIgiU8Mkp2HyowqBGpfHb90uv8fDm9glxhgQbVCaOd6FRNLudKOooto7BUAzx6wjS94mzBiDgj7dY9A91uirRT1f5cThYivcHhUGnVwbq7cZv+79tbmkWYI30TTqFLg8Kjyq9wNVkiQokoRusQZc1rsbenePuVCrjLrE/MJ9CcDfvzmFM5U13uZVv+RQwFrtRpo5CneP6+cbKQgAr3/+Hb6vqEZCdL3H1AZbYXchIzEa/2/SIN90Cb7ks3auJln2rxVvtoa9YQ02Wk5sm6uhV+rVdtevZb/otdXEtm0F5/Bog0TOpYqQJ3JMZuppr2SmtLIGEBeGsKq1/Qjq/1SFd9SHd2ZWUfuNt97P2v0XHgPfasd12+rKqarwPbbJYzdxrLrVkxveb/F56sqr/sf2Pb+oG6Ir4KktIxqcA7/XpvofW23wmObOmaeJfXU/bQ43jp2p9DVNAQIOt+qrLQC8b34GRYLLIwLqL0JEFGnq9/X0Jau1Oxoms/XLVNa44FYFlNpkSKpN0jJ7xOKszYmstHj85c5RbW5y4gzAHSA5PvJGx3QWqipwx6odtQt+er/l2RxufH++Gh7hbSOJ0ivoFmtAUUU1dLKEKJ2MSgf7zISLKUrn+wZtrXE1W0PQUJReRpROQbXLg8v6dEOPOKM3ya7t17PvlAXGumUWAN9B6yb5q3Gr0NfOgOybFyeAJ66rDdAr3gkCgQsJtSx7JyWUJcm3phWRFtV9sYQQ8Pi2BM7t959JQJIkJMToUVBqw6Eia4dOLMtkhjSnqQU/Y/QKkuINOFvpgArvTLjVTg8U2VvdfL52TSLqeBIAc7QeCTEGVNZ45/yBLBDI4LKkOCOi9ArsDjcWT8vye3M8cNqCe9/aiVijDlF6pdFjz9udKK6oRmKsAeVVTl8n+LqO+w37BSmy5Gva0Cve5o6eCdGIj/J2bFVVgSJLNSQA3Wvjqqxx4fR5b8IsSRLcHhUuVcCgeBsw6vrVSbX/1L33GxQZkiTgcgvERenw4s2X4soBSRACtbWStTWYqv/9pmoqG9Zq+v2EwDeF5/HbT48gSq/gbGUNFLmucaWOgFsVSIozwOEWmDu+P7J7mn21uh7V/7kPFlmw6stCVLs83okaA0kOJaB7rMF3/n9yeQZsTjc+3FeEWKPuwkSMAqhyevwmuKyb/LHRMWuPW7dPrv2bxhoV6BQZEN7O7B5VxQ/6JiI+Sg+BxufLUu3C/tMWKHLtIIjaMnXXiFpbC53ZIw7ResVXO3yyzA5JlnzPe+FsXqiV7hFvhF65kPTW1Wo3rFFvWHvf1N/Y9/vFT3dYGRUZFlWg3N6xc3oxmSFNGjMgCc/MyPZ1QLPULviZ2y8RUy9JQ0ZiDMptTjz94SGU
25xgX+3wkOBdCNPu9MAc7Z1kTwqk5zC8yYU5Soczld5q67pRRnWGpZuQmRxXW0Mn+7X/CyFQ7fQgPloPjypgUGQ43Cpkqemn9lanX6iRUYV3m06+sASEw6NCliTf8QD4Xo+ofa1S7eguVXj7czR3TqTa7EaWvZ2PU+KjmkzIQsHlFrWdo6V6owQvnKu6WI06BTpZYMLAHi1+o+6ZEI13vj6JGpcHiiLB4774H1OG9zqINehQanNg/OAeSIwx4IujZxslo+dsDl8yU5dkqh7RqO9MXfJRlzDW9YNJNUUj2uA9nqoKlNocuHtcf0wY1ANNaaqmt44QAiVWR6Nmk2AeE0r+3QD8k526hEdVhV8C1FTXAb+EqV4Zj6r6julW6yfTwLGSSjz/ybeIMigwKHK9dd68zVA1bhX62v5XHYnJDGnWxRb8dLtVLP3wkK9joDOQr5AUUka9jPt+OADv7DhZW4umg0GRvUPp0XJOkxCtx5lKJ+KMCuZNyGz0odBUDZ1RkeHwqKiwuxAfpcNtub3xt69PwuVR4XCjyU77ErxJiXeYubf9SwiBKL2CKL03IxFC1HbqjMEZSzWcHhVRsne/USej2qVCX/sVXaotX1/9JjDv556A2+MdTTQ4tXGiFkp1Sd/hIqv33Lu9sUqSBAFvrYP3NXiQlWa6aCzD0k3ISIzBebsTQm2cZDRUl7zpZO/fpu6DrrlkNKZBUufts9H4GeqmfqjT8G8GwO/5mnOx66ip6y+Yx4RS3XGVJrvntq+cjAT8+2Ax8oor0S1G1yiRq7C7mvzy0d4CW3mOqJOqW/BzwiDvt8n6bx55JZVQJECnSPB07X7uHa57rB46qemRDnWi9DIW/mgQ7p2QiWdmZCMrLR52h8fbBCBJ0CkSTFEKGr7fS/A2FRh1MrLS4lscGVFXQ+c9thulNgfsDrfvcXPHe597eK8EmKP1tU0sXnU1MVLth6Ko15FHADBF631z55RYHYgzKnho8iAMSInHebsLQnj7CPSIj4IiSXCpqi8xkGUJTreAIqHR65MleIdvw9v08ouJ7fehB1z44I2P0vlG47hUFW5Vhau2yU2nSIgz6gL6AJZlCQ9NHgSDToZbbb4GCrhwfqP0Ohj1EirsLmQmx/m+dMybkIk4o4ISq8M715AqIMkXjumt9fL+URr9D5b8tyqy929R9+Fa98Fa93wtudh11NT1F8xjuoLm/m71/5+0ZyLXHI5moi5ry9GzeOjdfYgxKCircvjmR6G2idLJGJAchyqnB2cra1DtUn2dYHUyEK1XkJkc3+IMwN51tIrx3dkqONweeFQBc7QB07JTMXVoarvOAPz18XL8c+cpOD0C0QYZ1mrvHDQe4e3U2zsxGnFGHcpsTrhqmy8bTsLnnV/E4/tGfr7a6e2vJbzNKTrZ25ekbnkIu9Pt7YQM7we0vnbOlEXTsjrsQy/U88y8/nkBfrvxKJy18/c09V9LhjfJSDZFweFWmxy26zdfSe357h5nwOnz1bA53H59XABvDmPUy3B7vKMevTVrEqL0OiSbjI1qSFqTWHAG4MA19XfjPDPtiMlM5KrfQdQ72Zsb352zhzssTTIZdVDh7YcSa9T7PjRq3B6U2ZwwKBJuHJmBUf0SA14mIJwfAvXfiOv6XKSaojBrVG/cOqo3gJZnAG7qjbx/j1hff61ImQH4y2Nn8cKnR3GyrAoOt1q7ZIgEneztWyTLEqL0CmINSosfdM2tdv+Hzfn4tqQSLk9tBw7pQj8l4MLfrG/3GLz2+Xft+sFKjbX3/2EmM/UwmYlcTXXSO/C9Jdxh+cQbFSRGexdzrHZ5O6de3icBsQYdjpyx4ZzNgZp6nSsVyX/kSID9aIMmA1AUCdF6BXFGnW9W1s+PnesSHxptfSOO1G/kDTW3pEFbljdo6tgNE8SGx+Tfo+thMlMPk5nI1rBJIL/U1u7PmRClwK16RwTIsoxkkxF3jO6D+CgDii3VqHJ6sP/UeRSeszebENS9MX+Z
fxafHDqDM5ZquAWgk4AUczSmDEvBmExv2fNVTt/iiSfKqvDX7cdx6nwNVCFgkIE+3WMxK7cPhBD4y7bjOFM7DbxRJyEjMQbDM7ohWpEBCUg1R8EcbUBinMHXabLhhxE/NIioIzCZqYfJDNVvEiiy1LTb8+hlCbn9E7F69qhmvz3WaU1C0NrkoaXyTESISCuYzNTDZIaACx/i173yZbsc36jISEuI6tKjGIiIOhKXMyBqoG4Id1soAHolRuOSdBO+Pn4e1moXZFmC2ahgUJpZk/1GiIi6AiYzRC3I7ZuA4Rnd0NMcjZw+3XwjUNhcQ0TUeTCZIWrGrMvSsfymnCb3haKmh4iIQoMzABM1wxTHldGJiLRAE8nMq6++ir59+yIqKgq5ubnYsWNHuEOiCDDtktRwh0BERAHo9MnM3//+dyxYsABLlizB7t27MWLECEyZMgWlpaXhDo26uBG9EsIdAhERBaDTJzMvvvgi5s6dizvvvBNDhw7FH//4R8TExODPf/5zuEOjLo4deomItKFTJzNOpxO7du3CpEmTfNtkWcakSZOwffv2MEZGWrVz0VUhLUdEROHXqUcznTt3Dh6PBykpKX7bU1JS8O233zb5GIfDAYfD4btvtVrbNUbSliRzDGL0MuwutdkyMXoZSeaYDoyKiIjaolPXzARj2bJlMJvNvltGRka4Q6JO5vDT0xCjb/rSj9HLOPz0tA6OiIiI2qJTJzNJSUlQFAVnzpzx237mzBmkpjY90mTRokWwWCy+26lTpzoiVNKYw09Pw85FVyE5zgijIiE5zoidi65iIkNEpEGdupnJYDDg8ssvx6ZNmzB9+nQAgKqq2LRpE+67774mH2M0GmE0GjswStKqJHMMdjw26eIFiYioU+vUyQwALFiwAHfccQdGjhyJUaNGYcWKFaiqqsKdd94Z7tCIiIioE+j0yczNN9+Ms2fP4oknnkBJSQkuvfRSbNiwoVGnYCIiIopMkhBChDuI9tSaJcSJiIioc2jN53en7gBMREREdDFMZoiIiEjTmMwQERGRpjGZISIiIk1jMkNERESaxmSGiIiINK3TzzPTVnUjz7ngJBERkXbUfW4HMoNMl09mKisrAYALThIREWlQZWUlzGZzi2W6/KR5qqqiqKgI8fHxkCQp3OGEndVqRUZGBk6dOsVJBMHz0RDPR2M8J/54PvzxfPgL5fkQQqCyshLp6emQ5ZZ7xXT5mhlZltGrV69wh9HpmEwm/serh+fDH89HYzwn/ng+/PF8+AvV+bhYjUwddgAmIiIiTWMyQ0RERJrGZCbCGI1GLFmyBEajMdyhdAo8H/54PhrjOfHH8+GP58NfuM5Hl+8ATERERF0ba2aIiIhI05jMEBERkaYxmSEiIiJNYzLTxZSXl+O2226DyWRCQkIC7r77bthsthbL33///Rg8eDCio6PRu3dvPPDAA7BYLH7lJElqdFuzZk17v5ygvPrqq+jbty+ioqKQm5uLHTt2tFj+vffew5AhQxAVFYXs7Gx89NFHfvuFEHjiiSeQlpaG6OhoTJo0CceOHWvPlxBSrTkfr7/+OsaNG4du3bqhW7dumDRpUqPys2fPbnQtTJ06tb1fRsi05nysXr260WuNioryKxNJ18fEiRObfC+49tprfWW0fH18/vnnuO6665Ceng5JkvDBBx9c9DGbN2/GZZddBqPRiAEDBmD16tWNyrT2Pakzae05ef/99/GjH/0IPXr0gMlkwujRo/HJJ5/4lXnyyScbXSNDhgxpW6CCupSpU6eKESNGiK+++kp88cUXYsCAAeKWW25ptvyBAwfEzJkzxfr160V+fr7YtGmTGDhwoLjxxhv9ygEQq1atEsXFxb5bdXV1e7+cVluzZo0wGAziz3/+szh06JCYO3euSEhIEGfOnGmy/NatW4WiKOK5554Thw8fFo899pjQ6/XiwIEDvjLLly8XZrNZfPDBB2Lfvn3i+uuvF/369euUr7+h1p6PW2+9Vbz6
6qtiz549Ii8vT8yePVuYzWZx+vRpX5k77rhDTJ061e9aKC8v76iX1CatPR+rVq0SJpPJ77WWlJT4lYmk66OsrMzvXBw8eFAoiiJWrVrlK6Pl6+Ojjz4Sjz76qHj//fcFALF27doWy3/33XciJiZGLFiwQBw+fFi8/PLLQlEUsWHDBl+Z1p7jzqa15+SXv/ylePbZZ8WOHTvE0aNHxaJFi4Rerxe7d+/2lVmyZIkYNmyY3zVy9uzZNsXJZKYLOXz4sAAgvvnmG9+2jz/+WEiSJL7//vuAj/Puu+8Kg8EgXC6Xb1sgF3FnMGrUKDF//nzffY/HI9LT08WyZcuaLH/TTTeJa6+91m9bbm6uuPfee4UQQqiqKlJTU8Xzzz/v219RUSGMRqN455132uEVhFZrz0dDbrdbxMfHi7/85S++bXfccYe44YYbQh1qh2jt+Vi1apUwm83NHi/Sr4+XXnpJxMfHC5vN5tum5eujvkDe837961+LYcOG+W27+eabxZQpU3z323qOO5NgPweGDh0qli5d6ru/ZMkSMWLEiNAFJoRgM1MXsn37diQkJGDkyJG+bZMmTYIsy/j6668DPo7FYoHJZIJO57/axfz585GUlIRRo0bhz3/+c0ArmXYkp9OJXbt2YdKkSb5tsixj0qRJ2L59e5OP2b59u195AJgyZYqvfGFhIUpKSvzKmM1m5ObmNnvMziKY89GQ3W6Hy+VCYmKi3/bNmzcjOTkZgwcPxrx581BWVhbS2NtDsOfDZrOhT58+yMjIwA033IBDhw759kX69fGnP/0Js2bNQmxsrN92LV4fwbjY+0cozrHWqaqKysrKRu8hx44dQ3p6Ovr374/bbrsNJ0+ebNPzMJnpQkpKSpCcnOy3TafTITExESUlJQEd49y5c3j66adxzz33+G1/6qmn8O6772Ljxo248cYb8Ytf/AIvv/xyyGIPhXPnzsHj8SAlJcVve0pKSrOvv6SkpMXydT9bc8zOIpjz0dDDDz+M9PR0vzfjqVOn4s0338SmTZvw7LPPYsuWLZg2bRo8Hk9I4w+1YM7H4MGD8ec//xnr1q3DX//6V6iqijFjxuD06dMAIvv62LFjBw4ePIg5c+b4bdfq9RGM5t4/rFYrqqurQ/J/UOteeOEF2Gw23HTTTb5tubm5WL16NTZs2ICVK1eisLAQ48aNQ2VlZdDP0+UXmuwKHnnkETz77LMtlsnLy2vz81itVlx77bUYOnQonnzySb99jz/+uO/3nJwcVFVV4fnnn8cDDzzQ5uelzmn58uVYs2YNNm/e7NfpddasWb7fs7OzMXz4cGRmZmLz5s24+uqrwxFquxk9ejRGjx7tuz9mzBhkZWXhtddew9NPPx3GyMLvT3/6E7KzszFq1Ci/7ZF0fVDL3n77bSxduhTr1q3z+6I9bdo03+/Dhw9Hbm4u+vTpg3fffRd33313UM/FmhkNWLhwIfLy8lq89e/fH6mpqSgtLfV7rNvtRnl5OVJTU1t8jsrKSkydOhXx8fFYu3Yt9Hp9i+Vzc3Nx+vRpOByONr++UElKSoKiKDhz5ozf9jNnzjT7+lNTU1ssX/ezNcfsLII5H3VeeOEFLF++HJ9++imGDx/eYtn+/fsjKSkJ+fn5bY65PbXlfNTR6/XIycnxvdZIvT6qqqqwZs2agD54tHJ9BKO59w+TyYTo6OiQXHNatWbNGsyZMwfvvvtuo6a4hhISEjBo0KA2XSNMZjSgR48eGDJkSIs3g8GA0aNHo6KiArt27fI99j//+Q9UVUVubm6zx7darZg8eTIMBgPWr1/faOhpU/bu3Ytu3bp1qvVIDAYDLr/8cmzatMm3TVVVbNq0ye/bdX2jR4/2Kw8AGzdu9JXv168fUlNT/cpYrVZ8/fXXzR6zswjmfADAc889h6effhobNmzw63/VnNOnT6OsrAxpaWkhibu9BHs+6vN4PDhw4IDvtUbi9QF4pzNwOBz46U9/etHn0cr1EYyLvX+E4prT
onfeeQd33nkn3nnnHb9h+82x2WwoKCho2zUS0u7EFHZTp04VOTk54uuvvxZffvmlGDhwoN/Q7NOnT4vBgweLr7/+WgghhMViEbm5uSI7O1vk5+f7DZVzu91CCCHWr18vXn/9dXHgwAFx7Ngx8Yc//EHExMSIJ554IiyvsSVr1qwRRqNRrF69Whw+fFjcc889IiEhwTec9mc/+5l45JFHfOW3bt0qdDqdeOGFF0ReXp5YsmRJk0OzExISxLp168T+/fvFDTfcoKmht605H8uXLxcGg0H84x//8LsWKisrhRBCVFZWioceekhs375dFBYWis8++0xcdtllYuDAgaKmpiYsr7E1Wns+li5dKj755BNRUFAgdu3aJWbNmiWioqLEoUOHfGUi6fqoc+WVV4qbb7650XatXx+VlZViz549Ys+ePQKAePHFF8WePXvEiRMnhBBCPPLII+JnP/uZr3zd0Oxf/epXIi8vT7z66qtNDs1u6Rx3dq09J3/729+ETqcTr776qt97SEVFha/MwoULxebNm0VhYaHYunWrmDRpkkhKShKlpaVBx8lkpospKysTt9xyi4iLixMmk0nceeedvg8iIYQoLCwUAMR///tfIYQQ//3vfwWAJm+FhYVCCO/w7ksvvVTExcWJ2NhYMWLECPHHP/5ReDyeMLzCi3v55ZdF7969hcFgEKNGjRJfffWVb9+ECRPEHXfc4Vf+3XffFYMGDRIGg0EMGzZM/Pvf//bbr6qqePzxx0VKSoowGo3i6quvFkeOHOmIlxISrTkfffr0afJaWLJkiRBCCLvdLiZPnix69Ogh9Hq96NOnj5g7d65m3piFaN35ePDBB31lU1JSxDXXXOM3X4YQkXV9CCHEt99+KwCITz/9tNGxtH59NPd+WHcO7rjjDjFhwoRGj7n00kuFwWAQ/fv395tzp05L57iza+05mTBhQovlhfAOX09LSxMGg0H07NlT3HzzzSI/P79NcXLVbCIiItI09pkhIiIiTWMyQ0RERJrGZIaIiIg0jckMERERaRqTGSIiItI0JjNERESkaUxmiIiISNOYzBAREVGrff7557juuuuQnp4OSZLwwQcftPoYQgi88MILGDRoEIxGI3r27In//d//bfVxmMwQUZe2detWZGdnQ6/XY/r06di8eTMkSUJFRUW4Q/Pp27cvVqxYEe4wiFqlqqoKI0aMwKuvvhr0MX75y1/ijTfewAsvvIBvv/0W69evb7QSeyB0QUdARKQBCxYswKWXXoqPP/4YcXFxiImJQXFxMcxmc7hDI9K0adOmYdq0ac3udzgcePTRR/HOO++goqICl1xyCZ599llMnDgRAJCXl4eVK1fi4MGDGDx4MADv4q3BYM0MEXVpBQUF+OEPf4hevXohISEBBoMBqampkCSpyfIejweqqnZwlERdz3333Yft27djzZo12L9/P37yk59g6tSpOHbsGADgX//6F/r3748PP/wQ/fr1Q9++fTFnzhyUl5e3+rmYzBBFmIkTJ+KBBx7Ar3/9ayQmJiI1NRVPPvmkb39FRQXmzJmDHj16wGQy4Yc//CH27dsHALBYLFAUBTt37gQAqKqKxMREXHHFFb7H//Wvf0VGRkZAsZw+fRq33HILEhMTERsbi5EjR+Lrr7/27V+5ciUyMzNhMBgwePBgvPXWW36PlyQJb7zxBmbMmIGYmBgMHDgQ69evBwAcP34ckiShrKwMd911FyRJwurVqxs1M61evRoJCQlYv349hg4dCqPRiJMnT6Jv3774zW9+g9tvvx1xcXHo06cP1q9fj7Nnz+KGG25AXFwchg8f7jsXdb788kuMGzcO0dHRyMjIwAMPPICqqirf/tLSUlx33XWIjo5Gv3798Le//S2gc0WkJSdPnsSqVavw3nvvYdy4ccjMzMRDDz2EK6+8EqtWrQIAfPfddzhx4gTee+89vPnmm1i9ejV27dqFH//4x61/wjYtU0lEmjNhwgRhMpnEk08+KY4e
PSr+8pe/CEmSfKsgT5o0SVx33XXim2++EUePHhULFy4U3bt3F2VlZUIIIS677DLx/PPPCyGE2Lt3r0hMTBQGg8G3OvucOXPEbbfddtE4KisrRf/+/cW4cePEF198IY4dOyb+/ve/i23btgkhhHj//feFXq8Xr776qjhy5Ij47W9/KxRFEf/5z398xwAgevXqJd5++21x7Ngx8cADD4i4uDhRVlYm3G63KC4uFiaTSaxYsUIUFxcLu93uWwX4/PnzQgghVq1aJfR6vRgzZozYunWr+Pbbb0VVVZXo06ePSExMFH/84x/F0aNHxbx584TJZBJTp04V7777rjhy5IiYPn26yMrKEqqqCiGEyM/PF7GxseKll14SR48eFVu3bhU5OTli9uzZvpinTZsmRowYIbZv3y527twpxowZI6Kjo8VLL73Utj8sURgBEGvXrvXd//DDDwUAERsb63fT6XTipptuEkIIMXfuXAHAb5X5Xbt2CQDi22+/bd3zh+RVEJFmTJgwQVx55ZV+237wgx+Ihx9+WHzxxRfCZDKJmpoav/2ZmZnitddeE0IIsWDBAnHttdcKIYRYsWKFuPnmm8WIESPExx9/LIQQYsCAAeL//u//LhrHa6+9JuLj431JUkNjxowRc+fO9dv2k5/8RFxzzTW++wDEY4895rtvs9kEAF8sQghhNpvFqlWrfPebSmYAiL179/o9V58+fcRPf/pT3/3i4mIBQDz++OO+bdu3bxcARHFxsRBCiLvvvlvcc889fsf54osvhCzLorq6Whw5ckQAEDt27PDtz8vLEwCYzJCmNUxm1qxZIxRFEd9++604duyY363u/8sTTzwhdDqd33HsdrsA4PtyFSh2ACaKQMOHD/e7n5aWhtLSUuzbtw82mw3du3f3219dXY2CggIAwIQJE/CnP/0JHo8HW7ZsweTJk5GamorNmzdj+PDhyM/P93Xwa8nevXuRk5ODxMTEJvfn5eXhnnvu8ds2duxY/O53v2v2tcTGxsJkMqG0tPSiz1+fwWBodE4aHjslJQUAkJ2d3WhbaWkpUlNTsW/fPuzfv9+v6UgIAVVVUVhYiKNHj0Kn0+Hyyy/37R8yZAgSEhJaFS9RZ5eTkwOPx4PS0lKMGzeuyTJjx46F2+1GQUEBMjMzAQBHjx4FAPTp06dVz8dkhigC6fV6v/uSJEFVVdhsNqSlpWHz5s2NHlP3gTt+/HhUVlZi9+7d+Pzzz/HMM88gNTUVy5cvx4gRI5Ceno6BAwdeNIbo6OhQvJRmX0trREdHN9khuP6x6/Y3ta3u+Ww2G+6991488MADjY7Vu3dv3xs1UVdgs9mQn5/vu19YWIi9e/ciMTERgwYNwm233Ybbb78dv/3tb5GTk4OzZ89i06ZNGD58OK699lpMmjQJl112Ge666y6sWLECqqpi/vz5+NGPfoRBgwa1KhZ2ACYin8suuwwlJSXQ6XQYMGCA3y0pKQmAN6kZPnw4XnnlFej1egwZMgTjx4/Hnj178OGHH2LChAkBPdfw4cOxd+/eZkcuZGVlYevWrX7btm7diqFDh7btRbajyy67DIcPH2507gYMGACDwYAhQ4bA7XZj165dvsccOXKkU815QxSonTt3IicnBzk5OQC80yDk5OTgiSeeAACsWrUKt99+OxYuXIjBgwdj+vTp+Oabb9C7d28AgCzL+Ne//oWkpCSMHz8e1157LbKysrBmzZpWx8KaGSLymTRpEkaPHo3p06fjueeew6BBg1BUVIR///vfmDFjBkaOHAnAOyLq5Zdf9o06SExMRFZWFv7+978HPIHWLbfcgmeeeQbTp0/HsmXLkJaWhj179iA9PR2jR4/Gr371K9x0003IycnBpEmT8K9//Qvvv/8+Pvvss3Z7/W318MMP44orrsB9992HOXPmIDY2FocPH8bGjRvxyiuvYPDgwZg6dSruvfderFy5EjqdDg8++GDIaqmIOtLEiRPh7S7TNL1ej6VLl2Lp0qXN
lklPT8c///nPNsfCmhki8pEkCR999BHGjx+PO++8E4MGDcKsWbNw4sQJX/8QwNtvxuPx+PWNmThxYqNtLTEYDPj000+RnJyMa665BtnZ2Vi+fDkURQEATJ8+Hb/73e/wwgsvYNiwYXjttdewatWqgI8fDsOHD8eWLVtw9OhRjBs3zvctNT093Vdm1apVSE9Px4QJEzBz5kzcc889SE5ODmPURNoniZbSKiIiIqJOjjUzREREpGlMZoioXTzzzDOIi4tr8tbSei5ERK3FZiYiahfl5eXNjlSKjo5Gz549OzgiIuqqmMwQERGRprGZiYiIiDSNyQwRERFpGpMZIiIi0jQmM0RERKRpTGaIiIhI05jMEBERkaYxmSEiIiJNYzJDREREmvb/AWnb8RQ31dpTAAAAAElFTkSuQmCC", "text/plain": [ "
" ] @@ -692,7 +706,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", + "language": "python", "name": "python3" }, "language_info": { @@ -705,7 +720,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.12.6" } }, "nbformat": 4, From c3f51a2a8835c92e8f95bda315d18da20267890e Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 20 Mar 2025 15:57:50 -0700 Subject: [PATCH 10/39] chore: throws NotImplementedError when read_pandas with nested JSON type (#1516) * chore: throws NotImplementedError when read_pandas with nested JSON type * increase test coverage * fix python 3.6 --- bigframes/core/utils.py | 35 +++++++++++++++- tests/system/small/test_session.py | 64 ++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index e38c43e73e..684290bf81 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -18,10 +18,12 @@ from typing import Hashable, Iterable, List import warnings +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.common as vendored_pandas_io_common import numpy as np import pandas as pd import pandas.api.types as pdtypes +import pyarrow as pa import typing_extensions import bigframes.dtypes as dtypes @@ -243,6 +245,22 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: return updated_columns +def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool: + """ + Searches recursively for JSON array type within a PyArrow DataType. 
+ """ + if arrow_type == dtypes.JSON_ARROW_TYPE: + return True + if pa.types.is_list(arrow_type): + return _search_for_nested_json_type(arrow_type.value_type) + if pa.types.is_struct(arrow_type): + for i in range(arrow_type.num_fields): + if _search_for_nested_json_type(arrow_type.field(i).type): + return True + return False + return False + + def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]: """ Due to a BigQuery IO limitation with loading JSON from Parquet files (b/374784249), @@ -253,12 +271,27 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]: updated_columns = [] for col in dataframe.columns: - if dataframe[col].dtype == dtypes.JSON_DTYPE: + column_type = dataframe[col].dtype + if column_type == dtypes.JSON_DTYPE: dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE) updated_columns.append(col) + elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type( + column_type.pyarrow_dtype + ): + raise NotImplementedError( + f"Nested JSON types, found in column `{col}`: `{column_type}`', " + f"are currently unsupported for upload. {constants.FEEDBACK_LINK}" + ) if dataframe.index.dtype == dtypes.JSON_DTYPE: dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE) updated_columns.append(dataframe.index.name) + elif isinstance( + dataframe.index.dtype, pd.ArrowDtype + ) and _search_for_nested_json_type(dataframe.index.dtype.pyarrow_dtype): + raise NotImplementedError( + f"Nested JSON types, found in the index: `{dataframe.index.dtype}`', " + f"are currently unsupported for upload. 
{constants.FEEDBACK_LINK}" + ) return updated_columns diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 323d002df4..663e5e2f10 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -26,6 +26,8 @@ import google.cloud.bigquery as bigquery import numpy as np import pandas as pd +import pandas.arrays as arrays +import pyarrow as pa import pytest import bigframes @@ -829,6 +831,68 @@ def test_read_pandas_json_index(session, write_engine): pd.testing.assert_index_equal(actual_result, expected_index) +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_load"), + ], +) +def test_read_pandas_w_nested_json(session, write_engine): + data = [ + [{"json_field": "1"}], + [{"json_field": None}], + [{"json_field": '["1","3","5"]'}], + [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}], + ] + # PyArrow currently lacks support for creating structs or lists containing extension types. 
+ # See issue: https://github.com/apache/arrow/issues/45262 + pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())]))) + pd_s = pd.Series( + arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])) + ), + ) + with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"): + # Until b/401630655 is resolved, json not compatible with allow_large_results=False + session.read_pandas(pd_s, write_engine=write_engine).to_pandas( + allow_large_results=True + ) + + +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_load"), + ], +) +def test_read_pandas_w_nested_json_index(session, write_engine): + data = [ + [{"json_field": "1"}], + [{"json_field": None}], + [{"json_field": '["1","3","5"]'}], + [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}], + ] + # PyArrow currently lacks support for creating structs or lists containing extension types. 
+ # See issue: https://github.com/apache/arrow/issues/45262 + pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())]))) + pd_idx: pd.Index = pd.Index( + arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])) + ), + ) + with pytest.raises( + NotImplementedError, match="Nested JSON types, found in the index" + ): + # Until b/401630655 is resolved, json not compatible with allow_large_results=False + session.read_pandas(pd_idx, write_engine=write_engine).to_pandas( + allow_large_results=True + ) + + @utils.skip_legacy_pandas @pytest.mark.parametrize( ("write_engine",), From 34ab9b8abd2c632c806afe69f00d9e7dddb6a8b5 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Thu, 20 Mar 2025 18:08:05 -0700 Subject: [PATCH 11/39] perf: update shape to use quer_and_wait (#1519) --- bigframes/session/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index e539525d80..1bea5cec02 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -462,7 +462,7 @@ def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: generate_row_count_plan(array_value.node) ) sql = self.compiler.compile(row_count_plan, ordered=False) - iter, _ = self._run_execute_query(sql) + iter, _ = self._run_execute_query(sql, query_with_job=False) return next(iter)[0] def cached( From 08ea02cee2e713e64aec9ae4b4760af9e748adad Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 21 Mar 2025 11:23:41 -0700 Subject: [PATCH 12/39] refactor: Separate dataset id generation from temp table management (#1520) --- bigframes/blob/_functions.py | 4 ++- bigframes/dataframe.py | 7 +++-- bigframes/session/__init__.py | 10 +++---- bigframes/session/executor.py | 10 ++++--- bigframes/session/loader.py | 10 ++++--- bigframes/session/temp_storage.py | 39 
++++++++++++++------------- tests/system/large/test_session.py | 29 +++++++++++--------- tests/system/small/test_encryption.py | 4 +-- 8 files changed, 63 insertions(+), 50 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index a3e7ae153c..6c9b435473 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -68,7 +68,9 @@ def _output_bq_type(self): def _create_udf(self): """Create Python UDF in BQ. Return name of the UDF.""" - udf_name = str(self._session._loader._storage_manager._random_table()) + udf_name = str( + self._session._loader._storage_manager.generate_unique_resource_id() + ) func_body = inspect.getsource(self._func) func_name = self._func.__name__ diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4955bb1295..1d3a45e879 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3760,10 +3760,9 @@ def to_gbq( ) if_exists = "replace" - temp_table_ref = self._session._temp_storage_manager._random_table( - # The client code owns this table reference now, so skip_cleanup=True - # to not clean it up when we close the session. 
- skip_cleanup=True, + # The client code owns this table reference now + temp_table_ref = ( + self._session._temp_storage_manager.generate_unique_resource_id() ) destination_table = f"{temp_table_ref.project}.{temp_table_ref.dataset_id}.{temp_table_ref.table_id}" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7b416d4424..acaad4a5b7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -248,7 +248,7 @@ def __init__( self._metrics = bigframes.session.metrics.ExecutionMetrics() self._function_session = bff_session.FunctionSession() self._temp_storage_manager = ( - bigframes.session.temp_storage.TemporaryGbqStorageManager( + bigframes.session.temp_storage.AnonymousDatasetManager( self._clients_provider.bqclient, location=self._location, session_id=self._session_id, @@ -908,7 +908,7 @@ def read_csv( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager._random_table() + table = self._temp_storage_manager.allocate_temp_table() if engine is not None and engine == "bigquery": if any(param is not None for param in (dtype, names)): @@ -1054,7 +1054,7 @@ def read_parquet( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager._random_table() + table = self._temp_storage_manager.allocate_temp_table() if engine == "bigquery": job_config = bigquery.LoadJobConfig() @@ -1108,7 +1108,7 @@ def read_json( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager._random_table() + table = self._temp_storage_manager.allocate_temp_table() if engine == "bigquery": @@ -1704,7 +1704,7 @@ def _start_query_ml_ddl( def _create_object_table(self, path: str, connection: str) -> str: """Create a random id Object Table from the input path and connection.""" - table = str(self._loader._storage_manager._random_table()) + table = str(self._loader._storage_manager.generate_unique_resource_id()) import textwrap diff --git a/bigframes/session/executor.py 
b/bigframes/session/executor.py index 1bea5cec02..07645c2a98 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -195,7 +195,7 @@ class BigQueryCachingExecutor(Executor): def __init__( self, bqclient: bigquery.Client, - storage_manager: bigframes.session.temp_storage.TemporaryGbqStorageManager, + storage_manager: bigframes.session.temp_storage.AnonymousDatasetManager, bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, *, strictly_ordered: bool = True, @@ -248,7 +248,7 @@ def execute( job_config = bigquery.QueryJobConfig() # Use explicit destination to avoid 10GB limit of temporary table if use_explicit_destination: - destination_table = self.storage_manager.create_temp_table( + destination_table = self.storage_manager.allocate_and_create_temp_table( array_value.schema.to_bigquery(), cluster_cols=[] ) job_config.destination = destination_table @@ -392,7 +392,7 @@ def peek( job_config = bigquery.QueryJobConfig() # Use explicit destination to avoid 10GB limit of temporary table if use_explicit_destination: - destination_table = self.storage_manager.create_temp_table( + destination_table = self.storage_manager.allocate_and_create_temp_table( array_value.schema.to_bigquery(), cluster_cols=[] ) job_config.destination = destination_table @@ -645,7 +645,9 @@ def _sql_as_cached_temp_table( cluster_cols: Sequence[str], ) -> bigquery.TableReference: assert len(cluster_cols) <= _MAX_CLUSTER_COLUMNS - temp_table = self.storage_manager.create_temp_table(schema, cluster_cols) + temp_table = self.storage_manager.allocate_and_create_temp_table( + schema, cluster_cols + ) # TODO: Get default job config settings job_config = cast( diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index b9859e92a2..1296e9d1b3 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -115,7 +115,7 @@ def __init__( self, session: bigframes.session.Session, bqclient: bigquery.Client, - storage_manager: 
bigframes.session.temp_storage.TemporaryGbqStorageManager, + storage_manager: bigframes.session.temp_storage.AnonymousDatasetManager, default_index_type: bigframes.enums.DefaultIndexKind, scan_index_uniqueness: bool, force_total_order: bool, @@ -167,7 +167,7 @@ def read_pandas_load_job( job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._storage_manager._random_table() + load_table_destination = self._storage_manager.allocate_temp_table() load_job = self._bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -216,7 +216,7 @@ def read_pandas_streaming( index=True, ) - destination = self._storage_manager.create_temp_table( + destination = self._storage_manager.allocate_and_create_temp_table( schema, [ordering_col], ) @@ -673,7 +673,9 @@ def _query_to_destination( ) else: cluster_cols = [] - temp_table = self._storage_manager.create_temp_table(schema, cluster_cols) + temp_table = self._storage_manager.allocate_and_create_temp_table( + schema, cluster_cols + ) timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get( "timeoutMs" diff --git a/bigframes/session/temp_storage.py b/bigframes/session/temp_storage.py index de764e4535..3b2965efef 100644 --- a/bigframes/session/temp_storage.py +++ b/bigframes/session/temp_storage.py @@ -24,7 +24,7 @@ _TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" -class TemporaryGbqStorageManager: +class AnonymousDatasetManager: """ Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. 
""" @@ -46,20 +46,22 @@ def __init__( ) self.session_id = session_id - self._table_ids: List[str] = [] + self._table_ids: List[bigquery.TableReference] = [] self._kms_key = kms_key - def create_temp_table( + def allocate_and_create_temp_table( self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] ) -> bigquery.TableReference: - # Can't set a table in _SESSION as destination via query job API, so we - # run DDL, instead. + """ + Allocates and and creates a table in the anonymous dataset. + The table will be cleaned up by clean_up_tables. + """ expiration = ( datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION ) table = bf_io_bigquery.create_temp_table( self.bqclient, - self._random_table(), + self.allocate_temp_table(), expiration, schema=schema, cluster_columns=list(cluster_cols), @@ -67,11 +69,19 @@ def create_temp_table( ) return bigquery.TableReference.from_string(table) - def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference: + def allocate_temp_table(self) -> bigquery.TableReference: + """ + Allocates a unique table id, but does not create the table. + The table will be cleaned up by clean_up_tables. + """ + table_id = self.generate_unique_resource_id() + self._table_ids.append(table_id) + return table_id + + def generate_unique_resource_id(self) -> bigquery.TableReference: """Generate a random table ID with BigQuery DataFrames prefix. - The generated ID will be stored and checked for deletion when the - session is closed, unless skip_cleanup is True. + This resource will not be cleaned up by this manager. 
Args: skip_cleanup (bool, default False): @@ -87,16 +97,9 @@ def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference: table_id = _TEMP_TABLE_ID_FORMAT.format( date=now.strftime("%Y%m%d"), session_id=self.session_id, random_id=random_id ) - if not skip_cleanup: - self._table_ids.append(table_id) return self.dataset.table(table_id) def clean_up_tables(self): """Delete tables that were created with this session's session_id.""" - client = self.bqclient - project_id = self.dataset.project - dataset_id = self.dataset.dataset_id - - for table_id in self._table_ids: - full_id = ".".join([project_id, dataset_id, table_id]) - client.delete_table(full_id, not_found_ok=True) + for table_ref in self._table_ids: + self.bqclient.delete_table(table_ref, not_found_ok=True) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index 7f13462cbe..e117cf0327 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -14,6 +14,7 @@ import datetime +import google.cloud.bigquery as bigquery import google.cloud.exceptions import pytest @@ -70,10 +71,14 @@ def test_close(session: bigframes.Session): + bigframes.constants.DEFAULT_EXPIRATION ) full_id_1 = bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration + session.bqclient, + session._temp_storage_manager.allocate_temp_table(), + expiration, ) full_id_2 = bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration + session.bqclient, + session._temp_storage_manager.allocate_temp_table(), + expiration, ) # check that the tables were actually created @@ -106,10 +111,14 @@ def test_clean_up_by_session_id(): + bigframes.constants.DEFAULT_EXPIRATION ) bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration + session.bqclient, + 
session._temp_storage_manager.allocate_temp_table(), + expiration, ) bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration + session.bqclient, + session._temp_storage_manager.allocate_temp_table(), + expiration, ) # check that some table exists with the expected session_id @@ -148,15 +157,11 @@ def test_clean_up_via_context_manager(session_creator): with session_creator() as session: bqclient = session.bqclient - expiration = ( - datetime.datetime.now(datetime.timezone.utc) - + bigframes.constants.DEFAULT_EXPIRATION + full_id_1 = session._temp_storage_manager.allocate_and_create_temp_table( + [bigquery.SchemaField("a", "INT64")], cluster_cols=[] ) - full_id_1 = bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration - ) - full_id_2 = bigframes.session._io.bigquery.create_temp_table( - session.bqclient, session._temp_storage_manager._random_table(), expiration + full_id_2 = session._temp_storage_manager.allocate_and_create_temp_table( + [bigquery.SchemaField("b", "STRING")], cluster_cols=["b"] ) # check that the tables were actually created diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 72529bc5b0..8ce53c218b 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -89,7 +89,7 @@ def test_session_load_job(bq_cmek, session_with_bq_cmek): pytest.skip("no cmek set for testing") # pragma: NO COVER # Session should have cmek set in the default query and load job configs - load_table = session_with_bq_cmek._temp_storage_manager._random_table() + load_table = session_with_bq_cmek._temp_storage_manager.allocate_temp_table() df = pandas.DataFrame({"col0": [1, 2, 3]}) load_job_config = bigquery.LoadJobConfig() @@ -194,7 +194,7 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): # Write the result to BQ custom table and assert 
encryption session_with_bq_cmek.bqclient.get_table(output_table_id) - output_table_ref = session_with_bq_cmek._temp_storage_manager._random_table() + output_table_ref = session_with_bq_cmek._temp_storage_manager.allocate_temp_table() output_table_id = str(output_table_ref) df.to_gbq(output_table_id) output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) From e9fe8154d83e2674a05d7b670e949368b175ec8b Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Fri, 21 Mar 2025 14:35:24 -0500 Subject: [PATCH 13/39] feat: add `GeoSeries.difference()` and `bigframes.bigquery.st_difference()` (#1471) * feat: add GeoSeries.difference() * add st_difference method and test cases for geo.difference and st_difference * update method and tests * update method and tests * update test to cover different inputs * update test assertion * update testing with single geometry object and add notebook tag to ignore the exception * fix docstrings * modify geo_difference to make it available for use and update tests and notebook * fix my py type error --- bigframes/bigquery/__init__.py | 3 +- bigframes/bigquery/_operations/geo.py | 120 ++++++ bigframes/core/compile/scalar_op_compiler.py | 17 +- bigframes/geopandas/geoseries.py | 5 +- bigframes/operations/__init__.py | 2 + bigframes/operations/geo_ops.py | 5 +- bigframes/operations/type.py | 14 + notebooks/geo/geoseries.ipynb | 386 ++++++++++++++---- tests/system/small/bigquery/test_geo.py | 96 ++++- .../system/small/geopandas/test_geoseries.py | 97 ++++- .../bigframes_vendored/geopandas/geoseries.py | 113 +++++ 11 files changed, 758 insertions(+), 100 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 56aee38bfe..a35e898c2d 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -27,7 +27,7 @@ unix_millis, unix_seconds, ) -from bigframes.bigquery._operations.geo import st_area +from bigframes.bigquery._operations.geo 
import st_area, st_difference from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -48,6 +48,7 @@ "array_to_string", # geo ops "st_area", + "st_difference", # json ops "json_set", "json_extract", diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 7b8e47e2da..a41c33f67d 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -15,6 +15,7 @@ from __future__ import annotations from bigframes import operations as ops +import bigframes.dtypes import bigframes.geopandas import bigframes.series @@ -91,3 +92,122 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series: series = series._apply_unary_op(ops.geo_area_op) series.name = None return series + + +def st_difference( + series: bigframes.series.Series, other: bigframes.series.Series +) -> bigframes.series.Series: + """ + Returns a GEOGRAPHY that represents the point set difference of + `geography_1` and `geography_2`. Therefore, the result consists of the part + of `geography_1` that doesn't intersect with `geography_2`. + + If `geography_1` is completely contained in `geography_2`, then ST_DIFFERENCE + returns an empty GEOGRAPHY. + + .. note:: + BigQuery's Geography functions, like `st_difference`, interpret the geometry + data type as a point set on the Earth's surface. A point set is a set + of points, lines, and polygons on the WGS84 reference spheroid, with + geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.geopandas + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + We can check two GeoSeries against each other, row by row. + + >>> s1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ...
LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(0, 1), + ... ], + ... ) + >>> s2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (1, 1), (0, 1)]), + ... LineString([(1, 0), (1, 3)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(1, 1), + ... Point(0, 1), + ... ], + ... index=range(1, 6), + ... ) + + >>> s1 + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 2 LINESTRING (0 0, 2 2) + 3 LINESTRING (2 0, 0 2) + 4 POINT (0 1) + dtype: geometry + + >>> s2 + 1 POLYGON ((0 0, 1 1, 0 1, 0 0)) + 2 LINESTRING (1 0, 1 3) + 3 LINESTRING (2 0, 0 2) + 4 POINT (1 1) + 5 POINT (0 1) + dtype: geometry + + >>> bbq.st_difference(s1, s2) + 0 None + 1 POLYGON ((0.99954 1, 2 2, 0 2, 0 1, 0.99954 1)) + 2 LINESTRING (0 0, 1 1.00046, 2 2) + 3 GEOMETRYCOLLECTION EMPTY + 4 POINT (0 1) + 5 None + dtype: geometry + + We can also check difference of single shapely geometries: + + >>> sbq1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) + ... ] + ... ) + >>> sbq2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) + ... ] + ... ) + + >>> sbq1 + 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) + dtype: geometry + + >>> sbq2 + 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) + dtype: geometry + + >>> bbq.st_difference(sbq1, sbq2) + 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... + dtype: geometry + + Additionally, we can check difference of a GeoSeries against a single shapely geometry: + + >>> bbq.st_difference(s1, sbq2) + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 None + 2 None + 3 None + 4 None + dtype: geometry + + Args: + other (bigframes.series.Series or geometric object): + The GeoSeries (elementwise) or geometric object to find the difference to. + + Returns: + bigframes.series.Series: + A GeoSeries of the points in each aligned geometry that are not + in other. 
+ """ + return series._apply_binary_op(other, ops.geo_st_difference_op) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 35a307722f..ce0cd6c37a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1001,11 +1001,6 @@ def normalize_op_impl(x: ibis_types.Value): # Geo Ops -@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) -def geo_st_boundary_op_impl(x: ibis_types.Value): - return st_boundary(x) - - @scalar_op_compiler.register_unary_op(ops.geo_area_op) def geo_area_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).area() @@ -1016,6 +1011,18 @@ def geo_st_astext_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).as_text() +@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) +def geo_st_boundary_op_impl(x: ibis_types.Value): + return st_boundary(x) + + +@scalar_op_compiler.register_binary_op(ops.geo_st_difference_op, pass_op=False) +def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).difference( + typing.cast(ibis_types.GeoSpatialValue, y) + ) + + @scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op) def geo_st_geogfromtext_op_impl(x: ibis_types.Value): # Ibis doesn't seem to provide a dedicated method to cast from string to geography, diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index 44018b8c5c..6c9cb77a08 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -62,7 +62,7 @@ def area(self, crs=None) -> bigframes.series.Series: # type: ignore Raises: NotImplementedError: - GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead. + GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
""" raise NotImplementedError( f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" @@ -93,3 +93,6 @@ def to_wkt(self: GeoSeries) -> bigframes.series.Series: series = self._apply_unary_op(ops.geo_st_astext_op) series.name = None return series + + def difference(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series: # type: ignore + return self._apply_binary_op(other, ops.geo_st_difference_op) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 83cefbe6ba..2b4c9ca892 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -90,6 +90,7 @@ geo_area_op, geo_st_astext_op, geo_st_boundary_op, + geo_st_difference_op, geo_st_geogfromtext_op, geo_st_geogpoint_op, geo_x_op, @@ -366,6 +367,7 @@ # Geo ops "geo_area_op", "geo_st_boundary_op", + "geo_st_difference_op", "geo_st_astext_op", "geo_st_geogfromtext_op", "geo_st_geogpoint_op", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 9ef0983e24..3cf248bddb 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -37,6 +37,10 @@ ), ) +geo_st_difference_op = base_ops.create_binary_op( + name="geo_st_difference", type_signature=op_typing.BinaryGeo() +) + geo_st_geogfromtext_op = base_ops.create_unary_op( name="geo_st_geogfromtext", type_signature=op_typing.FixedOutputType( @@ -44,7 +48,6 @@ ), ) - geo_st_geogpoint_op = base_ops.create_binary_op( name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 0a47cd91f0..b4029d74c7 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -122,6 +122,20 @@ def output_type( @dataclasses.dataclass +@dataclasses.dataclass +class BinaryGeo(BinaryTypeSignature): + """Type signature for geo functions like difference that can map geo to geo.""" + + def output_type( + self, left_type: 
ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_geo_like(left_type): + raise TypeError(f"Type {left_type} is not geo") + if (right_type is not None) and not bigframes.dtypes.is_geo_like(right_type): + raise TypeError(f"Type {right_type} is not geo") + return bigframes.dtypes.GEO_DTYPE + + class BinaryNumericGeo(BinaryTypeSignature): """Type signature for geo functions like from_xy that can map ints to ints.""" diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 7060128bf6..e7566f3fa6 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -49,14 +49,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: \u001b[93mTable 'bigquery-public-data.geo_us_boundaries.counties' is clustered\n", + "and/or partitioned, but BigQuery DataFrames was not able to find a\n", + "suitable index. 
To avoid this warning, set at least one of:\n", + "`index_col` or `filters`.\u001b[0m\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -74,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -97,21 +100,21 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "137 POINT (-86.87338 38.37334)\n", - "164 POINT (-118.48037 46.25461)\n", - "333 POINT (-92.5617 32.30429)\n", - "703 POINT (-83.46189 39.55525)\n", - "846 POINT (-119.46779 47.21363)\n", + "217 POINT (-86.80185 38.70532)\n", + "16 POINT (-83.47042 30.44723)\n", + "40 POINT (-94.33925 38.25722)\n", + "139 POINT (-78.88532 38.50758)\n", + "400 POINT (-95.6191 41.0337)\n", "Name: int_point_geom, dtype: geometry" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -130,21 +133,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 11, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -179,21 +182,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 -86.873385\n", - "1 -118.48037\n", - "2 -92.5617\n", - "3 -83.461893\n", - "4 -119.467788\n", + "0 -86.801847\n", + "1 -83.470416\n", + "2 -94.339246\n", + "3 -78.885321\n", + "4 -95.619101\n", "dtype: Float64" ] }, - "execution_count": 12, + 
"execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -211,21 +214,21 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 38.373344\n", - "1 46.254606\n", - "2 32.30429\n", - "3 39.555246\n", - "4 47.213633\n", + "0 38.705322\n", + "1 30.447232\n", + "2 38.257217\n", + "3 38.507585\n", + "4 41.033703\n", "dtype: Float64" ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -250,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -284,7 +287,7 @@ "dtype: Float64" ] }, - "execution_count": 14, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -302,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -336,7 +339,7 @@ "dtype: Float64" ] }, - "execution_count": 15, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -361,21 +364,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "78 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", - "130 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", - "544 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", - "995 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", - "1036 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", + "214 POLYGON ((-79.36704 34.96248, -79.36696 34.962...\n", + "161 POLYGON ((-89.08844 33.53252, -89.08843 33.532...\n", + "57 POLYGON ((-110.75069 35.50001, -110.75069 35.4...\n", + "46 POLYGON ((-94.6865 39.04405, -94.68764 39.0440...\n", + "260 POLYGON ((-100.53965 34.99391, -100.53966 34.9...\n", "Name: county_geom, dtype: geometry" ] }, - "execution_count": 16, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -394,21 +397,21 @@ 
}, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", - "1 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", - "2 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", - "3 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", - "4 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", + "0 POLYGON ((-79.36704 34.96248, -79.36696 34.962...\n", + "1 POLYGON ((-89.08844 33.53252, -89.08843 33.532...\n", + "2 POLYGON ((-110.75069 35.50001, -110.75069 35.4...\n", + "3 POLYGON ((-94.6865 39.04405, -94.68764 39.0440...\n", + "4 POLYGON ((-100.53965 34.99391, -100.53966 34.9...\n", "dtype: geometry" ] }, - "execution_count": 17, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": { "tags": [ "raises-exception" @@ -442,14 +445,14 @@ "outputs": [ { "ename": "NotImplementedError", - "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
You are currently running BigFrames version 1.40.0.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", - "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", - "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0" + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", + "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), instead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 1.40.0." ] } ], @@ -461,12 +464,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" + "### 3. Use `bigframes.bigquery.st_area` to retrieve the `area` in square meters instead. 
See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -475,21 +478,21 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1865212769.084914\n", - "1 1146753653.723439\n", - "2 1059653048.84506\n", - "3 2873655557.502374\n", - "4 886267772.361455\n", + "0 1014426111.476457\n", + "1 1196896004.730286\n", + "2 25794235993.165642\n", + "3 1242002056.351685\n", + "4 2381217221.963739\n", "dtype: Float64" ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -515,21 +518,21 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 21, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -554,21 +557,21 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT(-86.8733845 38.3733441)\n", - "1 POINT(-118.4803697 46.2546057)\n", - "2 POINT(-92.5616997 32.3042901)\n", - "3 POINT(-83.4618927 39.5552462)\n", - "4 POINT(-119.467788 47.2136328)\n", + "0 POINT(-86.8018468 38.705322)\n", + "1 POINT(-83.4704159 30.4472325)\n", + "2 POINT(-94.3392459 38.2572171)\n", + "3 POINT(-78.8853213 38.5075848)\n", + "4 POINT(-95.619101 41.0337028)\n", "dtype: string" ] }, - "execution_count": 22, + 
"execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -594,21 +597,21 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 23, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -641,7 +644,7 @@ "dtype: geometry" ] }, - "execution_count": 24, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -662,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -676,7 +679,7 @@ "dtype: geometry" ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -684,6 +687,209 @@ "source": [ "geom_obj.geo.boundary" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find the `difference` between two `GeoSeries` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reuse `wkts_from_geo` and `geom_obj` to find the difference between the geometry objects" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", + "dtype: geometry" + ] + }, + "execution_count": 21, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wkts_from_geo.difference(geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference between a `GeoSeries` and a single geometry shape." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + "dtype: geometry" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wkts_from_geo.difference([Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference in `GeoSeries` with the same shapes" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GEOMETRYCOLLECTION EMPTY\n", + "1 GEOMETRYCOLLECTION EMPTY\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 GEOMETRYCOLLECTION EMPTY\n", + "4 GEOMETRYCOLLECTION EMPTY\n", + "dtype: geometry" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_obj.difference(geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## You can also use`BigQuery.st_difference()` to find the difference between two `GeoSeries`. 
See, https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_difference" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", + "dtype: geometry" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(wkts_from_geo, geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference between a `GeoSeries` and a single geometry shape." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + "dtype: geometry" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(wkts_from_geo, [Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference in GeoSeries with the same shapes" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GEOMETRYCOLLECTION EMPTY\n", + "1 GEOMETRYCOLLECTION EMPTY\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 GEOMETRYCOLLECTION EMPTY\n", + "4 GEOMETRYCOLLECTION EMPTY\n", + "dtype: geometry" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(geom_obj, geom_obj)" + ] } ], "metadata": { diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index 7d38cd7d91..538099e80a 100644 --- a/tests/system/small/bigquery/test_geo.py +++ 
b/tests/system/small/bigquery/test_geo.py @@ -14,7 +14,12 @@ import geopandas # type: ignore import pandas as pd -from shapely.geometry import LineString, Point, Polygon # type: ignore +from shapely.geometry import ( # type: ignore + GeometryCollection, + LineString, + Point, + Polygon, +) import bigframes.bigquery as bbq import bigframes.geopandas @@ -51,3 +56,92 @@ def test_geo_st_area(): check_exact=False, rtol=1, ) + + +def test_geo_st_difference_with_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + Point(0, 1), + ] + + data2 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + LineString([(2, 0), (0, 2)]), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s2 = bigframes.geopandas.GeoSeries(data=data2) + geobf_s_result = bbq.st_difference(geobf_s1, geobf_s2).to_pandas() + + expected = bigframes.series.Series( + [ + GeometryCollection([]), + GeometryCollection([]), + Point(0, 1), + ], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + assert geobf_s_result.dtype == "geometry" + assert expected.iloc[0].equals(geobf_s_result.iloc[0]) + assert expected.iloc[1].equals(geobf_s_result.iloc[1]) + assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + + +def test_geo_st_difference_with_single_geometry_object(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Point(0, 1), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s_result = bbq.st_difference( + geobf_s1, + bigframes.geopandas.GeoSeries( + [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), + ] + ), + ).to_pandas() + + expected = bigframes.series.Series( + [ + GeometryCollection([]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + None, + ], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + 
assert geobf_s_result.dtype == "geometry" + assert (expected.iloc[0]).equals(geobf_s_result.iloc[0]) + assert expected.iloc[1] == geobf_s_result.iloc[1] + assert expected.iloc[2] == geobf_s_result.iloc[2] + + +def test_geo_st_difference_with_similar_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s_result = bbq.st_difference(geobf_s1, geobf_s1).to_pandas() + + expected = bigframes.series.Series( + [GeometryCollection([]), GeometryCollection([]), GeometryCollection([])], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + assert geobf_s_result.dtype == "geometry" + assert expected.iloc[0].equals(geobf_s_result.iloc[0]) + assert expected.iloc[1].equals(geobf_s_result.iloc[1]) + assert expected.iloc[2].equals(geobf_s_result.iloc[2]) diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index d0987dbdaf..fdd9826468 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -20,7 +20,12 @@ import google.api_core.exceptions import pandas as pd import pytest -from shapely.geometry import LineString, Point, Polygon # type: ignore +from shapely.geometry import ( # type: ignore + GeometryCollection, + LineString, + Point, + Polygon, +) import bigframes.geopandas import bigframes.series @@ -194,3 +199,93 @@ def test_geo_boundary(): check_series_type=False, check_index=False, ) + + +# the GeoSeries and GeoPandas results are not always the same. +# For example, when the difference between two polygons is empty, +# GeoPandas returns 'POLYGON EMPTY' while GeoSeries returns 'GeometryCollection([])'. +# This is why we are hard-coding the expected results. 
+def test_geo_difference_with_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + Point(0, 1), + ] + + data2 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + LineString([(2, 0), (0, 2)]), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_s2 = bigframes.geopandas.GeoSeries(data=data2) + + bf_result = bf_s1.difference(bf_s2).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [ + Polygon([]), + Polygon([]), + Point(0, 1), + ], + index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert expected.iloc[0].equals(bf_result.iloc[0]) + assert expected.iloc[1].equals(bf_result.iloc[1]) + assert expected.iloc[2].equals(bf_result.iloc[2]) + + +def test_geo_difference_with_single_geometry_object(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Point(0, 1), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_result = bf_s1.difference( + bigframes.geopandas.GeoSeries( + [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), + ] + ), + ).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [ + GeometryCollection([]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + None, + ], + index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert (expected.iloc[0]).equals(bf_result.iloc[0]) + assert expected.iloc[1] == bf_result.iloc[1] + assert expected.iloc[2] == bf_result.iloc[2] + + +def test_geo_difference_with_similar_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_result = bf_s1.difference(bf_s1).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [GeometryCollection([]), GeometryCollection([]), GeometryCollection([])], + 
index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert expected.iloc[0].equals(bf_result.iloc[0]) + assert expected.iloc[1].equals(bf_result.iloc[1]) + assert expected.iloc[2].equals(bf_result.iloc[2]) diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index a2e7b74059..b00d4220ff 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -239,3 +239,116 @@ def to_wkt(self) -> bigframes.series.Series: WKT representations of the geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore + """ + Returns a GeoSeries of the points in each aligned geometry that are not + in other. + + The operation works on a 1-to-1 row-wise manner + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.geopandas + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + We can check two GeoSeries against each other, row by row. + + >>> s1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(0, 1), + ... ], + ... ) + >>> s2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (1, 1), (0, 1)]), + ... LineString([(1, 0), (1, 3)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(1, 1), + ... Point(0, 1), + ... ], + ... index=range(1, 6), + ... 
) + + >>> s1 + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 2 LINESTRING (0 0, 2 2) + 3 LINESTRING (2 0, 0 2) + 4 POINT (0 1) + dtype: geometry + + >>> s2 + 1 POLYGON ((0 0, 1 1, 0 1, 0 0)) + 2 LINESTRING (1 0, 1 3) + 3 LINESTRING (2 0, 0 2) + 4 POINT (1 1) + 5 POINT (0 1) + dtype: geometry + + >>> s1.difference(s2) + 0 None + 1 POLYGON ((0.99954 1, 2 2, 0 2, 0 1, 0.99954 1)) + 2 LINESTRING (0 0, 1 1.00046, 2 2) + 3 GEOMETRYCOLLECTION EMPTY + 4 POINT (0 1) + 5 None + dtype: geometry + + We can also check difference of single shapely geometries: + + >>> sbq1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) + ... ] + ... ) + >>> sbq2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) + ... ] + ... ) + + >>> sbq1 + 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) + dtype: geometry + + >>> sbq2 + 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) + dtype: geometry + + >>> sbq1.difference(sbq2) + 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... + dtype: geometry + + Additionally, we can check difference of a GeoSeries against a single shapely geometry: + + >>> s1.difference(sbq2) + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 None + 2 None + 3 None + 4 None + dtype: geometry + + Args: + other (GeoSeries or geometric object): + The GeoSeries (elementwise) or geometric object to find the + difference to. + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of the points in each aligned geometry that are not + in other. + + Raises: + NotImplementedError: + GeoSeries.difference is not supported. Use + bigframes.bigquery.st_difference(series), instead. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From ab2455f7d6b3efea46ebf9a42836d6b8af09f6f2 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 21 Mar 2025 14:42:47 -0700 Subject: [PATCH 14/39] test: fix json tests fail locally with disable allow_large_results (#1523) * test: fix json tests fail locally with disable allow_large_results * fix test_read_gbq_w_json_in_array * move test_read_gbq_w_json* to test_session.py --- tests/system/small/bigquery/test_json.py | 91 +++++++------- tests/system/small/test_dataframe_io.py | 138 --------------------- tests/system/small/test_session.py | 149 +++++++++++++++++++++++ tests/unit/bigquery/__init__.py | 13 ++ tests/unit/bigquery/test_json.py | 26 ++++ 5 files changed, 238 insertions(+), 179 deletions(-) create mode 100644 tests/unit/bigquery/__init__.py create mode 100644 tests/unit/bigquery/test_json.py diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 16f66dae57..57fc878643 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -22,13 +22,6 @@ import bigframes.pandas as bpd -@pytest.fixture(scope="module", autouse=True) -def use_large_query_path(): - # b/401630655 - with bpd.option_context("bigquery.allow_large_results", True): - yield - - @pytest.mark.parametrize( ("json_path", "expected_json"), [ @@ -39,12 +32,14 @@ def use_large_query_path(): def test_json_set_at_json_path(json_path, expected_json): original_json = ['{"a": {"b": {"c": "tester", "d": []}}}'] s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) - actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) + actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + 
actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -63,11 +58,12 @@ def test_json_set_at_json_value_type(json_value, expected_json): original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}'] s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -80,18 +76,14 @@ def test_json_set_w_more_pairs(): expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) -def test_json_set_w_invalid_json_path_value_pairs(): - s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) - with pytest.raises(ValueError): - bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore - - def test_json_set_w_invalid_value_type(): s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with pytest.raises(TypeError): @@ -119,11 +111,13 @@ def test_json_extract_from_json(): ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract(s, "$.a.b").to_pandas() - expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas() + actual = bbq.json_extract(s, "$.a.b") + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( 
- actual, - expected, + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -134,9 +128,11 @@ def test_json_extract_from_string(): ) actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -169,9 +165,10 @@ def test_json_extract_array_from_json(): expected.index.name = None expected.name = None + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -185,9 +182,11 @@ def test_json_extract_array_from_json_strings(): [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], dtype=pd.ArrowDtype(pa.list_(pa.string())), ) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -201,9 +200,11 @@ def test_json_extract_array_from_json_array_strings(): [["1", "2", "3"], [], ["4", "5"]], dtype=pd.ArrowDtype(pa.list_(pa.string())), ) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -217,9 +218,11 @@ def test_json_extract_string_array_from_json_strings(): s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], 
[], ["4", "5"]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -227,9 +230,11 @@ def test_json_extract_string_array_from_array_strings(): s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) actual = bbq.json_extract_string_array(s) expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -237,17 +242,21 @@ def test_json_extract_string_array_as_float_array_from_array_strings(): s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"]) actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) def test_json_extract_string_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract_string_array(bpd.Series([1, 2])) + bbq.json_extract_string_array(s) def test_parse_json_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.parse_json(bpd.Series([1, 2])) + bbq.parse_json(s) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d2ba96b41f..cd21f5094c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -14,7 +14,6 @@ from typing import Tuple -import db_dtypes # type:ignore import google.api_core.exceptions import pandas as pd import pandas.testing @@ 
-281,143 +280,6 @@ def test_to_arrow_override_global_option(scalars_df_index): assert scalars_df_index._query_job.destination.table_id == table_id -def test_load_json_w_json_string_items(session): - sql = """ - SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, - UNION ALL - SELECT 1, JSON_OBJECT('int', 100), - UNION ALL - SELECT 2, JSON_OBJECT('float', 0.98), - UNION ALL - SELECT 3, JSON_OBJECT('string', 'hello world'), - UNION ALL - SELECT 4, JSON_OBJECT('array', [8, 9, 10]), - UNION ALL - SELECT 5, JSON_OBJECT('null', null), - UNION ALL - SELECT 6, JSON_OBJECT('b', 2, 'a', 1), - UNION ALL - SELECT - 7, - JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] - ) - ), - """ - df = session.read_gbq(sql, index_col="id") - - assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert df["json_col"][0] == '{"boolean":true}' - assert df["json_col"][1] == '{"int":100}' - assert df["json_col"][2] == '{"float":0.98}' - assert df["json_col"][3] == '{"string":"hello world"}' - assert df["json_col"][4] == '{"array":[8,9,10]}' - assert df["json_col"][5] == '{"null":null}' - - # Verifies JSON strings preserve array order, regardless of dictionary key order. 
- assert df["json_col"][6] == '{"a":1,"b":2}' - assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' - - -def test_load_json_to_pandas_has_correct_result(session): - df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") - assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - result = df.to_pandas() - - # These JSON strings are compatible with BigQuery's JSON storage, - pd_df = pd.DataFrame( - {"json_col": ['{"bar":true,"foo":10}']}, - dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), - ) - pd_df.index = pd_df.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) - pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) - - -def test_load_json_in_struct(session): - """Avoid regressions for internal issue 381148539.""" - sql = """ - SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col - UNION ALL - SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), - UNION ALL - SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), - UNION ALL - SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), - UNION ALL - SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), - UNION ALL - SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), - UNION ALL - SELECT - 6, - STRUCT(JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] - ) - ), 7), - """ - df = session.read_gbq(sql, index_col="id") - - assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) - assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) - - data = df["struct_col"].struct.field("data") - assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert data[0] == '{"boolean":true}' - assert data[1] == '{"int":100}' - assert data[2] == '{"float":0.98}' - assert data[3] == '{"string":"hello world"}' - assert data[4] == '{"array":[8,9,10]}' - assert data[5] == '{"null":null}' - assert data[6] == 
'{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' - - -def test_load_json_in_array(session): - sql = """ - SELECT - 0 AS id, - [ - JSON_OBJECT('boolean', True), - JSON_OBJECT('int', 100), - JSON_OBJECT('float', 0.98), - JSON_OBJECT('string', 'hello world'), - JSON_OBJECT('array', [8, 9, 10]), - JSON_OBJECT('null', null), - JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] - ) - ) - ] AS array_col, - """ - df = session.read_gbq(sql, index_col="id") - - assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) - assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) - - data = df["array_col"].list - assert data.len()[0] == 7 - assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert data[0][0] == '{"boolean":true}' - assert data[1][0] == '{"int":100}' - assert data[2][0] == '{"float":0.98}' - assert data[3][0] == '{"string":"hello world"}' - assert data[4][0] == '{"array":[8,9,10]}' - assert data[5][0] == '{"null":null}' - assert data[6][0] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' - - def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas_batches() APIs returns the expected dtypes.""" expected = scalars_df_default_index.dtypes diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 663e5e2f10..e286c40450 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -22,6 +22,7 @@ import warnings import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq +import db_dtypes # type:ignore import google import google.cloud.bigquery as bigquery import numpy as np @@ -603,6 +604,154 @@ def test_read_gbq_external_table(session: bigframes.Session): assert df["i1"].max() == 99 +def test_read_gbq_w_json(session): + sql = """ + SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, + UNION ALL + SELECT 1, JSON_OBJECT('int', 100), + UNION ALL + SELECT 2, 
JSON_OBJECT('float', 0.98), + UNION ALL + SELECT 3, JSON_OBJECT('string', 'hello world'), + UNION ALL + SELECT 4, JSON_OBJECT('array', [8, 9, 10]), + UNION ALL + SELECT 5, JSON_OBJECT('null', null), + UNION ALL + SELECT 6, JSON_OBJECT('b', 2, 'a', 1), + UNION ALL + SELECT + 7, + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] + ) + ), + """ + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True) + + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + assert df["json_col"][0] == '{"boolean":true}' + assert df["json_col"][1] == '{"int":100}' + assert df["json_col"][2] == '{"float":0.98}' + assert df["json_col"][3] == '{"string":"hello world"}' + assert df["json_col"][4] == '{"array":[8,9,10]}' + assert df["json_col"][5] == '{"null":null}' + + # Verifies JSON strings preserve array order, regardless of dictionary key order. 
+ assert df["json_col"][6] == '{"a":1,"b":2}' + assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' + + +def test_read_gbq_w_json_and_compare_w_pandas_json(session): + df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + result = df.to_pandas(allow_large_results=True) + + # These JSON strings are compatible with BigQuery's JSON storage, + pd_df = pd.DataFrame( + {"json_col": ['{"bar":true,"foo":10}']}, + dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), + ) + pd_df.index = pd_df.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + + +def test_read_gbq_w_json_in_struct(session): + """Avoid regressions for internal issue 381148539.""" + sql = """ + SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col + UNION ALL + SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), + UNION ALL + SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), + UNION ALL + SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), + UNION ALL + SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), + UNION ALL + SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), + UNION ALL + SELECT + 6, + STRUCT(JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] + ) + ), 7), + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) + + data = df["struct_col"].struct.field("data") + assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + data = data.to_pandas(allow_large_results=True) + + assert data[0] 
== '{"boolean":true}' + assert data[1] == '{"int":100}' + assert data[2] == '{"float":0.98}' + assert data[3] == '{"string":"hello world"}' + assert data[4] == '{"array":[8,9,10]}' + assert data[5] == '{"null":null}' + assert data[6] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' + + +def test_read_gbq_w_json_in_array(session): + sql = """ + SELECT + 0 AS id, + [ + JSON_OBJECT('boolean', True), + JSON_OBJECT('int', 100), + JSON_OBJECT('float', 0.98), + JSON_OBJECT('string', 'hello world'), + JSON_OBJECT('array', [8, 9, 10]), + JSON_OBJECT('null', null), + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ) + ] AS array_col, + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) + + data = df["array_col"] + assert data.list.len()[0] == 7 + assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + pd_data = data.to_pandas(allow_large_results=True) + + assert pd_data[0] == [ + '{"boolean":true}', + '{"int":100}', + '{"float":0.98}', + '{"string":"hello world"}', + '{"array":[8,9,10]}', + '{"null":null}', + '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}', + ] + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/bigquery/test_json.py b/tests/unit/bigquery/test_json.py new file mode 100644 index 0000000000..d9beea26db --- /dev/null +++ b/tests/unit/bigquery/test_json.py @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest.mock as mock + +import pytest + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_json_set_w_invalid_json_path_value_pairs(): + mock_series = mock.create_autospec(bpd.pandas.Series, instance=True) + with pytest.raises(ValueError, match="Incorrect format"): + bbq.json_set(mock_series, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore From 499e00a0305fd63ebdedf30c06b775903951ac96 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 24 Mar 2025 09:51:53 -0700 Subject: [PATCH 15/39] chore: ensure udfs are actively deleted, make pre-commit mypy dep consistent (#1522) --- .pre-commit-config.yaml | 2 +- .../large/functions/test_managed_function.py | 28 +++++++++++-------- tests/system/utils.py | 18 +++++++++++- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d11c951a1..8ca120bd07 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,6 +38,6 @@ repos: rev: v1.10.0 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate, pandas-stubs] + additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126] exclude: "^third_party" args: ["--check-untyped-defs", "--explicit-package-bases", "--ignore-missing-imports"] diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 47cbf7fb1b..7c8c74e005 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -72,7 +72,7 @@ def multiply(x, y): ) finally: # clean up the gcp assets created for the managed function. - cleanup_function_assets(multiply, bigquery_client) + cleanup_function_assets(multiply, bigquery_client, ignore_failures=False) def test_managed_function_stringify_with_ibis( @@ -118,7 +118,7 @@ def stringify(x): ) finally: # clean up the gcp assets created for the managed function. 
- cleanup_function_assets(stringify, bigquery_client) + cleanup_function_assets(stringify, bigquery_client, ignore_failures=False) @pytest.mark.parametrize( @@ -167,7 +167,7 @@ def featurize(x: int) -> list[array_dtype]: # type: ignore finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(featurize, session.bqclient) + cleanup_function_assets(featurize, session.bqclient, ignore_failures=False) @pytest.mark.parametrize( @@ -234,7 +234,7 @@ def foo(x: int) -> typ: # type:ignore pandas.testing.assert_frame_equal(bf_result_gbq, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(foo, session.bqclient) + cleanup_function_assets(foo, session.bqclient, ignore_failures=False) @pytest.mark.parametrize( @@ -274,7 +274,7 @@ def foo_list(x: int) -> list[typ]: # type:ignore pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(foo_list, session.bqclient) + cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False) def test_managed_function_series_combine(session, scalars_dfs): @@ -330,7 +330,9 @@ def add(x: int, y: int) -> int: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(add_managed_func, session.bqclient) + cleanup_function_assets( + add_managed_func, session.bqclient, ignore_failures=False + ) def test_managed_function_series_combine_array_output(session, scalars_dfs): @@ -391,7 +393,9 @@ def add_list(x: int, y: int) -> list[int]: pandas.testing.assert_series_equal(bf_result_gbq, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. 
- cleanup_function_assets(add_list_managed_func, session.bqclient) + cleanup_function_assets( + add_list_managed_func, session.bqclient, ignore_failures=False + ) def test_managed_function_dataframe_map(session, scalars_dfs): @@ -425,7 +429,7 @@ def add_one(x): pandas.testing.assert_frame_equal(bf_result, pd_result) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(mf_add_one, session.bqclient) + cleanup_function_assets(mf_add_one, session.bqclient, ignore_failures=False) def test_managed_function_dataframe_map_array_output( @@ -464,7 +468,9 @@ def add_one_list(x): pandas.testing.assert_frame_equal(bf_result_gbq, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(mf_add_one_list, session.bqclient) + cleanup_function_assets( + mf_add_one_list, session.bqclient, ignore_failures=False + ) def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs): @@ -500,7 +506,7 @@ def add_ints(x, y): ) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(add_ints_mf, session.bqclient) + cleanup_function_assets(add_ints_mf, session.bqclient, ignore_failures=False) def test_managed_function_dataframe_apply_axis_1_array_output(session): @@ -605,4 +611,4 @@ def foo(x, y, z): finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets(foo, session.bqclient) + cleanup_function_assets(foo, session.bqclient, ignore_failures=False) diff --git a/tests/system/utils.py b/tests/system/utils.py index bc1fe6745e..891d813935 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -401,7 +401,7 @@ def cleanup_function_assets( ) -> None: """Clean up the GCP assets behind a bigframess function.""" - # Clean up bigframes function. + # Clean up bigframes bigquery function. 
try: bigquery_client.delete_routine(bigframes_func.bigframes_bigquery_function) except Exception: @@ -409,6 +409,12 @@ def cleanup_function_assets( if not ignore_failures: raise + if not ignore_failures: + # Make sure that the BQ routins is actually deleted + with pytest.raises(google.api_core.exceptions.NotFound): + bigquery_client.get_routine(bigframes_func.bigframes_bigquery_function) + + # Clean up bigframes cloud run function if cloudfunctions_client: # Clean up cloud function try: @@ -420,6 +426,16 @@ def cleanup_function_assets( if not ignore_failures: raise + if not ignore_failures: + # Make sure the cloud run function is actually deleted + try: + gcf = cloudfunctions_client.get_function( + name=bigframes_func.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.DELETING + except google.cloud.exceptions.NotFound: + pass + def get_function_name(func, package_requirements=None, is_row_processor=False): """Get a bigframes function name for testing given a udf.""" From 578081e978f2cca21ddae8b3ee371972ba723777 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 24 Mar 2025 10:41:45 -0700 Subject: [PATCH 16/39] fix: read_pandas inline returns None when exceeds limit (#1525) --- bigframes/session/__init__.py | 42 ++++++++++++++---------------- tests/unit/session/test_session.py | 33 +++++++++++++++++++++++ 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index acaad4a5b7..dfee41c90b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -794,13 +794,14 @@ def _read_pandas( ) if write_engine == "default": - inline_df = self._read_pandas_inline(pandas_dataframe, should_raise=False) - if inline_df is not None: + try: + inline_df = self._read_pandas_inline(pandas_dataframe) return inline_df + except ValueError: + pass return self._read_pandas_load_job(pandas_dataframe, api_name) elif write_engine == "bigquery_inline": - # Regarding the 
type: ignore, with should_raise=True, this should never return None. - return self._read_pandas_inline(pandas_dataframe, should_raise=True) # type: ignore + return self._read_pandas_inline(pandas_dataframe) elif write_engine == "bigquery_load": return self._read_pandas_load_job(pandas_dataframe, api_name) elif write_engine == "bigquery_streaming": @@ -809,12 +810,16 @@ def _read_pandas( raise ValueError(f"Got unexpected write_engine '{write_engine}'") def _read_pandas_inline( - self, pandas_dataframe: pandas.DataFrame, should_raise=False - ) -> Optional[dataframe.DataFrame]: + self, pandas_dataframe: pandas.DataFrame + ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe - if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES: - return None + memory_usage = pandas_dataframe.memory_usage(deep=True).sum() + if memory_usage > MAX_INLINE_DF_BYTES: + raise ValueError( + f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed " + f"for inline data ({MAX_INLINE_DF_BYTES} bytes)." + ) try: local_block = blocks.Block.from_local(pandas_dataframe, self) @@ -825,29 +830,22 @@ def _read_pandas_inline( ValueError, # Thrown by ibis for some unhandled types TypeError, # Not all types handleable by local code path ) as exc: - if should_raise: - raise ValueError( - f"Could not convert with a BigQuery type: `{exc}`. " - ) from exc - else: - return None - - inline_types = inline_df._block.expr.schema.dtypes + raise ValueError( + f"Could not convert with a BigQuery type: `{exc}`. " + ) from exc # Make sure all types are inlinable to avoid escaping errors. + inline_types = inline_df._block.expr.schema.dtypes noninlinable_types = [ dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES ] - if len(noninlinable_types) == 0: - return inline_df - - if should_raise: + if len(noninlinable_types) != 0: raise ValueError( f"Could not inline with a BigQuery type: `{noninlinable_types}`. 
" f"{constants.FEEDBACK_LINK}" ) - else: - return None + + return inline_df def _read_pandas_load_job( self, diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index d024d332d4..b35449f291 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -22,6 +22,8 @@ import google.api_core.exceptions import google.cloud.bigquery import google.cloud.bigquery.table +import pandas as pd +import pyarrow as pa import pytest import bigframes @@ -458,3 +460,34 @@ def today(cls): with pytest.warns(bigframes.exceptions.ObsoleteVersionWarning): resources.create_bigquery_session() + + +@mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1) +def test_read_pandas_inline_exceeds_limit_raises_error(): + session = resources.create_bigquery_session() + pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises( + ValueError, + match=r"DataFrame size \(.* bytes\) exceeds the maximum allowed for inline data \(1 bytes\)\.", + ): + session.read_pandas(pd_df, write_engine="bigquery_inline") + + +def test_read_pandas_inline_w_interval_type_raises_error(): + session = resources.create_bigquery_session() + df = pd.DataFrame(pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50])) + with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "): + session.read_pandas(df, write_engine="bigquery_inline") + + +def test_read_pandas_inline_w_noninlineable_type_raises_error(): + session = resources.create_bigquery_session() + data = [ + [1, 2, 3], + [4, 5], + None, + [6, 7, 8, 9], + ] + s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"): + session.read_pandas(s, write_engine="bigquery_inline") From 9ff3fa8bfc933ff407f449b84e8352bdf2debc47 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 25 Mar 2025 07:47:33 -0700 Subject: [PATCH 17/39] chore: disable managed function tests temporarily (#1532) This is to 
give a breather to the BigQuery managed udf team to troubleshoot the quota issue. --- tests/system/large/blob/test_function.py | 6 ++++++ tests/system/large/functions/test_managed_function.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index dfdbbffede..588d9b3321 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -25,6 +25,12 @@ from bigframes import dtypes import bigframes.pandas as bpd +# TODO(shobs): restore these tests after the managed udf cleanup issue is +# resolved in the test project +pytestmark = pytest.mark.skip( + reason="temporarily disable to debug managed udf cleanup in the test project" +) + @pytest.fixture(scope="function") def images_output_folder() -> Generator[str, None, None]: diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 7c8c74e005..7001736bb2 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -21,6 +21,12 @@ import bigframes.pandas as bpd from tests.system.utils import cleanup_function_assets +# TODO(shobs): restore these tests after the managed udf cleanup issue is +# resolved in the test project +pytestmark = pytest.mark.skip( + reason="temporarily disable to debug managed udf cleanup in the test project" +) + bpd.options.experiments.udf = True From d553fa25fe85b3590269ed2ce08d5dff3bd22dfc Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Tue, 25 Mar 2025 11:25:17 -0500 Subject: [PATCH 18/39] docs: update `GeoSeries.difference()` and `bigframes.bigquery.st_difference()` docs (#1526) * docs: update GeoSeries.difference() and bigframes.bigquery.st_difference() docs * update variable names for readability --- bigframes/bigquery/_operations/geo.py | 26 ++++++------ 
.../bigframes_vendored/geopandas/geoseries.py | 41 ++++++++----------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index a41c33f67d..6501c84b6f 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -28,14 +28,14 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series: """ Returns the area in square meters covered by the polygons in the input - GEOGRAPHY. + `GEOGRAPHY`. If geography_expression is a point or a line, returns zero. If geography_expression is a collection, returns the area of the polygons in the collection; if the collection doesn't contain polygons, returns zero. - ..note:: + .. note:: BigQuery's Geography functions, like `st_area`, interpret the geometry data type as a point set on the Earth's surface. A point set is a set of points, lines, and polygons on the WGS84 reference spheroid, with @@ -98,14 +98,14 @@ def st_difference( series: bigframes.series.Series, other: bigframes.series.Series ) -> bigframes.series.Series: """ - Returns a GEOGRAPHY that represents the point set difference of + Returns a `GEOGRAPHY` that represents the point set difference of `geography_1` and `geography_2`. Therefore, the result consists of the part of `geography_1` that doesn't intersect with `geography_2`. - If `geometry_1` is completely contained in `geometry_2`, then ST_DIFFERENCE - returns an empty GEOGRAPHY. + If `geometry_1` is completely contained in `geometry_2`, then `ST_DIFFERENCE` + returns an empty `GEOGRAPHY`. - ..note:: + .. note:: BigQuery's Geography functions, like `st_difference`, interpret the geometry data type as a point set on the Earth's surface. 
A point set is a set of points, lines, and polygons on the WGS84 reference spheroid, with @@ -119,7 +119,7 @@ def st_difference( >>> from shapely.geometry import Polygon, LineString, Point >>> bpd.options.display.progress_bar = None - We can check two GeoSeries against each other, row by row. + We can check two GeoSeries against each other, row by row: >>> s1 = bigframes.geopandas.GeoSeries( ... [ @@ -168,32 +168,32 @@ def st_difference( We can also check difference of single shapely geometries: - >>> sbq1 = bigframes.geopandas.GeoSeries( + >>> polygon_s1 = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) ... ] ... ) - >>> sbq2 = bigframes.geopandas.GeoSeries( + >>> polygon_s2 = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) ... ] ... ) - >>> sbq1 + >>> polygon_s1 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) dtype: geometry - >>> sbq2 + >>> polygon_s2 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) dtype: geometry - >>> bbq.st_difference(sbq1, sbq2) + >>> bbq.st_difference(polygon_s1, polygon_s2) 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... dtype: geometry Additionally, we can check difference of a GeoSeries against a single shapely geometry: - >>> bbq.st_difference(s1, sbq2) + >>> bbq.st_difference(s1, polygon_s2) 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) 1 None 2 None diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index b00d4220ff..0d6b74671e 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -242,10 +242,10 @@ def to_wkt(self) -> bigframes.series.Series: def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore """ - Returns a GeoSeries of the points in each aligned geometry that are not - in other. + Returns a GeoSeries of the points in each aligned geometry that are not + in other. 
- The operation works on a 1-to-1 row-wise manner + The operation works on a 1-to-1 row-wise manner. **Examples:** @@ -254,7 +254,7 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> from shapely.geometry import Polygon, LineString, Point >>> bpd.options.display.progress_bar = None - We can check two GeoSeries against each other, row by row. + We can check two GeoSeries against each other, row by row: >>> s1 = bigframes.geopandas.GeoSeries( ... [ @@ -303,32 +303,32 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore We can also check difference of single shapely geometries: - >>> sbq1 = bigframes.geopandas.GeoSeries( + >>> polygon_s1 = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) ... ] ... ) - >>> sbq2 = bigframes.geopandas.GeoSeries( + >>> polygon_s2 = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) ... ] ... ) - >>> sbq1 + >>> polygon_s1 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) dtype: geometry - >>> sbq2 + >>> polygon_s2 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) dtype: geometry - >>> sbq1.difference(sbq2) + >>> polygon_s1.difference(polygon_s2) 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... dtype: geometry Additionally, we can check difference of a GeoSeries against a single shapely geometry: - >>> s1.difference(sbq2) + >>> s1.difference(polygon_s2) 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) 1 None 2 None @@ -336,19 +336,14 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore 4 None dtype: geometry - Args: - other (GeoSeries or geometric object): - The GeoSeries (elementwise) or geometric object to find the - difference to. - - Returns: - bigframes.geopandas.GeoSeries: - A GeoSeries of the points in each aligned geometry that are not - in other. + Args: + other (bigframes.geopandas.GeoSeries or geometric object): + The GeoSeries (elementwise) or geometric object to find the + difference to. 
- Raises: - NotImplementedError: - GeoSeries.difference is not supported. Use - bigframes.bigquery.st_difference(series), instead. + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of the points in each aligned geometry that are not + in other. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 4b0cf572a921b66a3002ba269528223ddf2251a6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 25 Mar 2025 10:54:58 -0700 Subject: [PATCH 19/39] chore: allow PRECEDING and FOLLOWING to appear on both side of BETWEEN when windowing (#1507) * chore: allow PRECEDING and FOLLOWING to appear on both side of BETWEEN when windowing * fix lint * Simplify the code by using the sign of the value for PRECEDING/FOLLOWING * fix lint * fix mypy * polish doc * remove float support for range rolling because Pandas does not support that --- bigframes/core/block_transforms.py | 6 +-- bigframes/core/compile/compiled.py | 47 ++++++++++++++------ bigframes/core/compile/polars/compiler.py | 33 +++++++------- bigframes/core/groupby/__init__.py | 8 ++-- bigframes/core/window_spec.py | 52 ++++++++++++++++------- bigframes/dataframe.py | 6 +-- bigframes/series.py | 6 +-- tests/unit/core/test_windowspec.py | 29 +++++++++++++ 8 files changed, 128 insertions(+), 59 deletions(-) create mode 100644 tests/unit/core/test_windowspec.py diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 0e9525d5af..09ef17dff5 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -213,8 +213,8 @@ def _interpolate_column( if interpolate_method not in ["linear", "nearest", "ffill"]: raise ValueError("interpolate method not supported") window_ordering = (ordering.OrderingExpression(ex.deref(x_values)),) - backwards_window = windows.rows(following=0, ordering=window_ordering) - forwards_window = windows.rows(preceding=0, ordering=window_ordering) + backwards_window = windows.rows(end=0, ordering=window_ordering) + 
forwards_window = windows.rows(start=0, ordering=window_ordering) # Note, this method may block, notnull = block.apply_unary_op(column, ops.notnull_op) @@ -450,7 +450,7 @@ def rank( ) if method == "dense" else windows.rows( - following=0, ordering=window_ordering, grouping_keys=grouping_cols + end=0, ordering=window_ordering, grouping_keys=grouping_cols ), skip_reproject_unsafe=(col != columns[-1]), ) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index c3d4c10267..d2fd7f3ea2 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,7 +21,9 @@ import bigframes_vendored.ibis import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery import bigframes_vendored.ibis.common.deferred as ibis_deferred # type: ignore +from bigframes_vendored.ibis.expr import builders as ibis_expr_builders import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +from bigframes_vendored.ibis.expr.operations import window as ibis_expr_window import bigframes_vendored.ibis.expr.operations as ibis_ops import bigframes_vendored.ibis.expr.types as ibis_types import pandas @@ -551,20 +553,9 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec): # Unbound grouping window. Suitable for aggregations but not for analytic function application. 
order_by = None - bounds = window_spec.bounds window = bigframes_vendored.ibis.window(order_by=order_by, group_by=group_by) - if bounds is not None: - if isinstance(bounds, RangeWindowBounds): - window = window.preceding_following( - bounds.preceding, bounds.following, how="range" - ) - if isinstance(bounds, RowsWindowBounds): - if bounds.preceding is not None or bounds.following is not None: - window = window.preceding_following( - bounds.preceding, bounds.following, how="rows" - ) - else: - raise ValueError(f"unrecognized window bounds {bounds}") + if window_spec.bounds is not None: + return _add_boundary(window_spec.bounds, window) return window @@ -681,3 +672,33 @@ def _as_groupable(value: ibis_types.Value): return scalar_op_compiler.to_json_string(value) else: return value + + +def _to_ibis_boundary( + boundary: Optional[int], +) -> Optional[ibis_expr_window.WindowBoundary]: + if boundary is None: + return None + return ibis_expr_window.WindowBoundary( + abs(boundary), preceding=boundary <= 0 # type:ignore + ) + + +def _add_boundary( + bounds: typing.Union[RowsWindowBounds, RangeWindowBounds], + ibis_window: ibis_expr_builders.LegacyWindowBuilder, +) -> ibis_expr_builders.LegacyWindowBuilder: + if isinstance(bounds, RangeWindowBounds): + return ibis_window.range( + start=_to_ibis_boundary(bounds.start), + end=_to_ibis_boundary(bounds.end), + ) + if isinstance(bounds, RowsWindowBounds): + if bounds.start is not None or bounds.end is not None: + return ibis_window.rows( + start=_to_ibis_boundary(bounds.start), + end=_to_ibis_boundary(bounds.end), + ) + return ibis_window + else: + raise ValueError(f"unrecognized window bounds {bounds}") diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 6d5b11a5e8..6fac3c9b92 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,9 +16,10 @@ import dataclasses import functools import itertools -from typing import cast, 
Sequence, Tuple, TYPE_CHECKING +from typing import cast, Optional, Sequence, Tuple, TYPE_CHECKING, Union import bigframes.core +from bigframes.core import window_spec import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.nodes as nodes @@ -366,23 +367,8 @@ def compile_window(self, node: nodes.WindowOpNode): indexed_df = df.with_row_index(index_col_name) if len(window.grouping_keys) == 0: # rolling-only window # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html - finite = ( - window.bounds.preceding is not None - and window.bounds.following is not None - ) - offset_n = ( - None - if window.bounds.preceding is None - else -window.bounds.preceding - ) - # collecting height is a massive kludge - period_n = ( - df.collect().height - if not finite - else cast(int, window.bounds.preceding) - + cast(int, window.bounds.following) - + 1 - ) + offset_n = window.bounds.start + period_n = _get_period(window.bounds) or df.collect().height results = indexed_df.rolling( index_column=index_col_name, period=f"{period_n}i", @@ -395,3 +381,14 @@ def compile_window(self, node: nodes.WindowOpNode): # polars is columnar, so this is efficient # TODO: why can't just add columns? return pl.concat([df, results], how="horizontal") + + +def _get_period( + bounds: Union[window_spec.RowsWindowBounds, window_spec.RangeWindowBounds] +) -> Optional[int]: + """Returns None if the boundary is infinite.""" + if bounds.start is None or bounds.end is None: + return None + + # collecting height is a massive kludge + return bounds.end - bounds.start + 1 diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 126d2f4dd2..3134df0daf 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -309,8 +309,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. 
window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), - preceding=window - 1, - following=0, + start=-(window - 1), + end=0, min_periods=min_periods or window, ) block = self._block.order_by( @@ -742,8 +742,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), - preceding=window - 1, - following=0, + start=-(window - 1), + end=0, min_periods=min_periods or window, ) block = self._block.order_by( diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index b4a3d35471..a286234fc8 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -52,8 +52,8 @@ def unbound( ### Rows-based Windows def rows( grouping_keys: Tuple[str, ...] = (), - preceding: Optional[int] = None, - following: Optional[int] = None, + start: Optional[int] = None, + end: Optional[int] = None, min_periods: int = 0, ordering: Tuple[orderings.OrderingExpression, ...] = (), ) -> WindowSpec: @@ -63,10 +63,12 @@ def rows( Args: grouping_keys: Columns ids of grouping keys - preceding: - number of preceding rows to include. If None, include all preceding rows + start: + The window's starting boundary relative to the current row. For example, "-1" means one row prior + "1" means one row after, and "0" means the current row. If None, the window is unbounded from the start. following: - number of following rows to include. If None, include all following rows + The window's ending boundary relative to the current row. For example, "-1" means one row prior + "1" means one row after, and "0" means the current row. If None, the window is unbounded until the end. min_periods (int, default 0): Minimum number of input rows to generate output. 
ordering: @@ -74,7 +76,10 @@ def rows( Returns: WindowSpec """ - bounds = RowsWindowBounds(preceding=preceding, following=following) + bounds = RowsWindowBounds( + start=start, + end=end, + ) return WindowSpec( grouping_keys=tuple(map(ex.deref, grouping_keys)), bounds=bounds, @@ -97,7 +102,7 @@ def cumulative_rows( Returns: WindowSpec """ - bounds = RowsWindowBounds(following=0) + bounds = RowsWindowBounds(end=0) return WindowSpec( grouping_keys=tuple(map(ex.deref, grouping_keys)), bounds=bounds, @@ -119,7 +124,7 @@ def inverse_cumulative_rows( Returns: WindowSpec """ - bounds = RowsWindowBounds(preceding=0) + bounds = RowsWindowBounds(start=0) return WindowSpec( grouping_keys=tuple(map(ex.deref, grouping_keys)), bounds=bounds, @@ -132,18 +137,35 @@ def inverse_cumulative_rows( @dataclass(frozen=True) class RowsWindowBounds: - preceding: Optional[int] = None - following: Optional[int] = None - + start: Optional[int] = None + end: Optional[int] = None -# TODO: Expand to datetime offsets -OffsetType = Union[float, int] + def __post_init__(self): + if self.start is None: + return + if self.end is None: + return + if self.start > self.end: + raise ValueError( + f"Invalid window: start({self.start}) is greater than end({self.end})" + ) @dataclass(frozen=True) class RangeWindowBounds: - preceding: Optional[OffsetType] = None - following: Optional[OffsetType] = None + # TODO(b/388916840) Support range rolling on timeseries with timedeltas. 
+ start: Optional[int] = None + end: Optional[int] = None + + def __post_init__(self): + if self.start is None: + return + if self.end is None: + return + if self.start > self.end: + raise ValueError( + f"Invalid window: start({self.start}) is greater than end({self.end})" + ) @dataclass(frozen=True) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1d3a45e879..9fd6ff5c7b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2428,12 +2428,12 @@ def replace( @validations.requires_ordering() def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = windows.rows(preceding=limit, following=0) + window = windows.rows(start=None if limit is None else -limit, end=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) @validations.requires_ordering() def bfill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = windows.rows(preceding=0, following=limit) + window = windows.rows(start=0, end=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def isin(self, values) -> DataFrame: @@ -3310,7 +3310,7 @@ def _perform_join_by_index( def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
window_def = windows.rows( - preceding=window - 1, following=0, min_periods=min_periods or window + start=-(window - 1), end=0, min_periods=min_periods or window ) return bigframes.core.window.Window( self._block, window_def, self._block.value_columns diff --git a/bigframes/series.py b/bigframes/series.py index 5f49daa07d..a33a3fca5c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -544,7 +544,7 @@ def cumsum(self) -> Series: @validations.requires_ordering() def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = windows.rows(preceding=limit, following=0) + window = windows.rows(start=None if limit is None else -limit, end=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill @@ -552,7 +552,7 @@ def ffill(self, *, limit: typing.Optional[int] = None) -> Series: @validations.requires_ordering() def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = windows.rows(preceding=0, following=limit) + window = windows.rows(start=0, end=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) @validations.requires_ordering() @@ -1441,7 +1441,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
window_spec = windows.rows( - preceding=window - 1, following=0, min_periods=min_periods or window + start=-(window - 1), end=0, min_periods=min_periods or window ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True diff --git a/tests/unit/core/test_windowspec.py b/tests/unit/core/test_windowspec.py new file mode 100644 index 0000000000..a630c87dda --- /dev/null +++ b/tests/unit/core/test_windowspec.py @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes.core import window_spec + + +@pytest.mark.parametrize(("start", "end"), [(-1, -2), (1, -2), (2, 1)]) +def test_invalid_rows_window_boundary_raise_error(start, end): + with pytest.raises(ValueError): + window_spec.RowsWindowBounds(start, end) + + +@pytest.mark.parametrize(("start", "end"), [(-1, -2), (1, -2), (2, 1)]) +def test_invalid_range_window_boundary_raise_error(start, end): + with pytest.raises(ValueError): + window_spec.RangeWindowBounds(start, end) From 803c6dd2b35d69723ebeccce9feb2c56a12c680f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 25 Mar 2025 11:37:31 -0700 Subject: [PATCH 20/39] chore: include source columns in semantics.top_k (#1531) Co-authored-by: Shenyang Cai --- bigframes/operations/semantics.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index f4b9d85103..3b22f19539 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -807,13 +807,17 @@ def top_k( >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") - >>> df = bpd.DataFrame({"Animals": ["Dog", "Bird", "Cat", "Horse"]}) + >>> df = bpd.DataFrame( + ... { + ... "Animals": ["Dog", "Bird", "Cat", "Horse"], + ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], + ... 
}) >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) - Animals - 0 Dog - 2 Cat + Animals Sounds + 0 Dog Woof + 2 Cat Meow - [2 rows x 1 columns] + [2 rows x 2 columns] Args: instruction (str): @@ -911,14 +915,8 @@ def top_k( ) num_selected += num_new_selected - df = ( - df[df[status_column] > 0] - .drop(["index", status_column], axis=1) - .rename(columns={"old_index": "index"}) - .set_index("index") - ) - df.index.name = None - return df + result_df: bigframes.dataframe.DataFrame = self._df.copy() + return result_df[df.set_index("old_index")[status_column] > 0.0] @staticmethod def _topk_partition( From ac59173f5cc10cf31d7545147008aff98603e499 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 25 Mar 2025 12:43:50 -0700 Subject: [PATCH 21/39] refactor: introduce ai operator namespace and deprecated semantics (#1511) * refactor: introduce ai operator namespace and deprecated semantics * duplicate semantic op options too * improve test coverage * relax test condition * clean up semantics and add public docs * addressing comments * use FutureWarning for deprecation * copy over recent top_k changes --- bigframes/_config/compute_options.py | 23 +- bigframes/_config/experiment_options.py | 18 +- bigframes/dataframe.py | 10 + bigframes/operations/ai.py | 896 ++++++++++++++++ bigframes/operations/semantics.py | 8 +- docs/reference/bigframes.pandas/frame.rst | 8 + docs/templates/toc.yml | 2 + tests/system/large/operations/test_ai.py | 956 ++++++++++++++++++ tests/system/small/operations/test_ai.py | 141 +++ .../system/small/operations/test_semantics.py | 19 + tests/unit/_config/test_experiment_options.py | 17 +- tests/unit/test_dataframe.py | 11 + 12 files changed, 2098 insertions(+), 11 deletions(-) create mode 100644 bigframes/operations/ai.py create mode 100644 tests/system/large/operations/test_ai.py create mode 100644 tests/system/small/operations/test_ai.py diff --git a/bigframes/_config/compute_options.py 
b/bigframes/_config/compute_options.py index 21b41eb185..eb287f6065 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -60,19 +60,31 @@ class ComputeOptions: bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. + enable_multi_query_execution (bool, Options): If enabled, large queries may be factored into multiple smaller queries in order to avoid generating queries that are too complex for the query engine to handle. However this comes at the cost of increase cost and latency. + extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. - semmantic_ops_confirmation_threshold (int, optional): - Guards against unexepcted processing of large amount of rows by semantic operators. + + semantic_ops_confirmation_threshold (int, optional): + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead + + semantic_ops_threshold_autofail (bool): + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead + + ai_ops_confirmation_threshold (int, optional): + Guards against unexpected processing of large amount of rows by semantic operators. If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None to turn off the guard. - semantic_ops_threshold_autofail (bool): - Guards against unexepcted processing of large amount of rows by semantic operators. + + ai_ops_threshold_autofail (bool): + Guards against unexpected processing of large amount of rows by semantic operators. When set to True, the operation automatically fails without asking for user inputs. 
""" @@ -84,6 +96,9 @@ class ComputeOptions: semantic_ops_confirmation_threshold: Optional[int] = 0 semantic_ops_threshold_autofail = False + ai_ops_confirmation_threshold: Optional[int] = 0 + ai_ops_threshold_autofail = False + def assign_extra_query_labels(self, **kwargs: Any) -> None: """ Assigns additional custom labels for query configuration. The method updates the diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 3d52976004..abe465de50 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -24,6 +24,7 @@ class ExperimentOptions: def __init__(self): self._semantic_operators: bool = False + self._ai_operators: bool = False self._blob: bool = False self._udf: bool = False @@ -35,11 +36,24 @@ def semantic_operators(self) -> bool: def semantic_operators(self, value: bool): if value is True: msg = bfe.format_message( - "Semantic operators are still under experiments, and are subject " + "Semantic operators are deprecated, and will be removed in the future" + ) + warnings.warn(msg, category=FutureWarning) + self._semantic_operators = value + + @property + def ai_operators(self) -> bool: + return self._ai_operators + + @ai_operators.setter + def ai_operators(self, value: bool): + if value is True: + msg = bfe.format_message( + "AI operators are still under experiments, and are subject " "to change in the future." 
) warnings.warn(msg, category=bfe.PreviewWarning) - self._semantic_operators = value + self._ai_operators = value @property def blob(self) -> bool: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9fd6ff5c7b..b0648d65ba 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -74,6 +74,7 @@ import bigframes.operations as ops import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops +import bigframes.operations.ai import bigframes.operations.plotting as plotting import bigframes.operations.semantics import bigframes.operations.structs @@ -4574,4 +4575,13 @@ def _throw_if_null_index(self, opname: str): @property def semantics(self): + msg = bfe.format_message( + "The 'semantics' property will be removed. Please use 'ai' instead." + ) + warnings.warn(msg, category=FutureWarning) return bigframes.operations.semantics.Semantics(self) + + @property + def ai(self): + """Returns the accessor for AI operators.""" + return bigframes.operations.ai.AIAccessor(self) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py new file mode 100644 index 0000000000..0ff92187cf --- /dev/null +++ b/bigframes/operations/ai.py @@ -0,0 +1,896 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import typing +from typing import List, Optional +import warnings + +import numpy as np + +from bigframes import dtypes, exceptions +from bigframes.core import guid, log_adapter + + +@log_adapter.class_logger +class AIAccessor: + def __init__(self, df) -> None: + import bigframes # Import in the function body to avoid circular imports. + import bigframes.dataframe + + if not bigframes.options.experiments.ai_operators: + raise NotImplementedError() + + self._df: bigframes.dataframe.DataFrame = df + + def filter(self, instruction: str, model, ground_with_google_search: bool = False): + """ + Filters the DataFrame with the semantics of the user instruction. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) + >>> df.ai.filter("{city} is the capital of {country}", model) + country city + 1 Germany Berlin + + [1 rows x 2 columns] + + Args: + instruction (str): + An instruction on how to filter the data. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example, if you have a column "food", you can refer to this column + in the instructions like: + "The {food} is healthy." + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. 
Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.pandas.DataFrame: DataFrame filtered by the instruction. + + Raises: + NotImplementedError: when the AI operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. + """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + self._confirm_operation(len(self._df)) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False + for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + user_instruction = self._format_instruction(instruction, columns) + output_instruction = "Based on the provided context, reply to the following claim by only True or False:" + + if has_blob_column: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + df, + prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + else: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + 
ground_with_google_search=ground_with_google_search, + ), + ) + + return self._df[ + results["ml_generate_text_llm_result"].str.lower().str.contains("true") + ] + + def map( + self, + instruction: str, + output_column: str, + model, + ground_with_google_search: bool = False, + ): + """ + Maps the DataFrame with the semantics of the user instruction. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) + >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + ingredient_1 ingredient_2 food + 0 Burger Bun Beef Patty Burger + + 1 Soy Bean Bittern Tofu + + + [2 rows x 3 columns] + + Args: + instruction (str): + An instruction on how to map the data. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example, if you have a column "food", you can refer to this column + in the instructions like: + "Get the ingredients of {food}." + + output_column (str): + The column name of the mapping result. + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. 
+ + Returns: + bigframes.pandas.DataFrame: DataFrame with attached mapping results. + + Raises: + NotImplementedError: when the AI operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. + """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + self._confirm_operation(len(self._df)) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False + for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + user_instruction = self._format_instruction(instruction, columns) + output_instruction = ( + "Based on the provided contenxt, answer the following instruction:" + ) + + if has_blob_column: + results = typing.cast( + bigframes.series.Series, + model.predict( + df, + prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) + else: + results = typing.cast( + bigframes.series.Series, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) + + from bigframes.core.reshape.api import concat + + return 
concat([self._df, results.rename(output_column)], axis=1) + + def join( + self, + other, + instruction: str, + model, + ground_with_google_search: bool = False, + ): + """ + Joines two dataframes by applying the instruction over each pair of rows from + the left and right table. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) + >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) + + >>> cities.ai.join(continents, "{city} is in {continent}", model) + city continent + 0 Seattle North America + 1 Ottawa North America + 2 Shanghai Asia + 3 New Delhi Asia + + [4 rows x 2 columns] + + Args: + other (bigframes.pandas.DataFrame): + The other dataframe. + + instruction (str): + An instruction on how left and right rows can be joined. This value must contain + column references by name. which should be wrapped in a pair of braces. + For example: "The {city} belongs to the {country}". + For column names that are shared between two dataframes, you need to add "left." + and "right." prefix for differentiation. This is especially important when you do + self joins. For example: "The {left.employee_name} reports to {right.employee_name}" + For unique column names, this prefix is optional. + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + max_rows (int, default 1000): + The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method + call will end early with an error. 
+ + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.pandas.DataFrame: The joined dataframe. + + Raises: + ValueError if the amount of data that will be sent for LLM processing is larger than max_rows. + """ + self._validate_model(model) + columns = self._parse_columns(instruction) + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) + + left_columns = [] + right_columns = [] + + for col in columns: + if col in self._df.columns and col in other.columns: + raise ValueError(f"Ambiguous column reference: {col}") + + elif col in self._df.columns: + left_columns.append(col) + + elif col in other.columns: + right_columns.append(col) + + elif col.startswith("left."): + original_col_name = col[len("left.") :] + if ( + original_col_name in self._df.columns + and original_col_name in other.columns + ): + left_columns.append(col) + elif original_col_name in self._df.columns: + left_columns.append(col) + instruction = instruction.replace(col, original_col_name) + else: + raise ValueError(f"Column {col} not found") + + elif col.startswith("right."): + original_col_name = col[len("right.") :] + if ( + original_col_name in self._df.columns + and original_col_name in other.columns + ): + right_columns.append(col) + elif 
original_col_name in other.columns: + right_columns.append(col) + instruction = instruction.replace(col, original_col_name) + else: + raise ValueError(f"Column {col} not found") + + else: + raise ValueError(f"Column {col} not found") + + if not left_columns: + raise ValueError("No left column references.") + + if not right_columns: + raise ValueError("No right column references.") + + # Update column references to be compatible with internal naming scheme. + # That is, "left.col" -> "col_left" and "right.col" -> "col_right" + instruction = re.sub(r"(?>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> import bigframes + >>> bigframes.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + + >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) + >>> df.ai.search("creatures", "monkey", top_k=1, model=model, score_column='distance') + creatures distance + 3 chimpanzee 0.635844 + + [1 rows x 2 columns] + + Args: + search_column: + The name of the column to search from. + query (str): + The search query. + top_k (int): + The number of nearest neighbors to return. + model (TextEmbeddingGenerator): + A TextEmbeddingGenerator provided by Bigframes ML package. + score_column (Optional[str], default None): + The name of the the additional column containning the similarity scores. If None, + this column won't be attached to the result. + + Returns: + DataFrame: the DataFrame with the search result. + + Raises: + ValueError: when the search_column is not found from the the data frame. + TypeError: when the provided model is not TextEmbeddingGenerator. 
+ """ + + if search_column not in self._df.columns: + raise ValueError(f"Column `{search_column}` not found") + + self._confirm_operation(len(self._df)) + + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + if top_k < 1: + raise ValueError("top_k must be an integer greater than or equal to 1.") + + embedded_df = model.predict(self._df[search_column]) + embedded_table = embedded_df.reset_index().to_gbq() + + import bigframes.pandas as bpd + + embedding_result_column = "ml_generate_embedding_result" + query_df = model.predict(bpd.DataFrame({"query_id": [query]})).rename( + columns={"content": "query_id", embedding_result_column: "embedding"} + ) + + import bigframes.bigquery as bbq + + search_result = ( + bbq.vector_search( + base_table=embedded_table, + column_to_search=embedding_result_column, + query=query_df, + top_k=top_k, + ) + .rename(columns={"content": search_column}) + .set_index("index") + ) + + search_result.index.name = self._df.index.name + + if score_column is not None: + search_result = search_result.rename(columns={"distance": score_column})[ + [search_column, score_column] + ] + else: + search_result = search_result[[search_column]] + + import bigframes.dataframe + + return typing.cast(bigframes.dataframe.DataFrame, search_result) + + def top_k( + self, + instruction: str, + model, + k: int = 10, + ground_with_google_search: bool = False, + ): + """ + Ranks each tuple and returns the k best according to the instruction. + + This method employs a quick select algorithm to efficiently compare the pivot + with all other items. By leveraging an LLM (Large Language Model), it then + identifies the top 'k' best answers from these comparisons. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame( + ... { + ... "Animals": ["Dog", "Bird", "Cat", "Horse"], + ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], + ... }) + >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) + Animals Sounds + 0 Dog Woof + 2 Cat Meow + + [2 rows x 2 columns] + + Args: + instruction (str): + An instruction on how to map the data. This value must contain + column references by name enclosed in braces. + For example, to reference a column named "Animals", use "{Animals}" in the + instruction, like: "{Animals} are more popular as pets" + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by the Bigframes ML package. + + k (int, default 10): + The number of rows to return. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. + + Raises: + NotImplementedError: when the AI operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. 
+ """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + if len(columns) > 1: + raise NotImplementedError("AI top K are limited to a single column.") + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) + self._confirm_operation(work_estimate) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + column = columns[0] + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + # `index` is reserved for the `reset_index` below. + if column == "index": + raise ValueError( + "Column name 'index' is reserved. Please choose a different name." + ) + + if k < 1: + raise ValueError("k must be an integer greater than or equal to 1.") + + user_instruction = self._format_instruction(instruction, columns) + + n = df.shape[0] + if k >= n: + return df + + # Create a unique index and duplicate it as the "index" column. This workaround + # is needed for the select search algorithm due to unimplemented bigFrame methods. + df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() + + # Initialize a status column to track the selection status of each item. 
+ # - None: Unknown/not yet processed + # - 1.0: Selected as part of the top-k items + # - -1.0: Excluded from the top-k items + status_column = guid.generate_guid("status") + df[status_column] = bigframes.series.Series( + None, dtype=dtypes.FLOAT_DTYPE, session=df._session + ) + + num_selected = 0 + while num_selected < k: + df, num_new_selected = self._topk_partition( + df, + column, + status_column, + user_instruction, + model, + k - num_selected, + ground_with_google_search, + ) + num_selected += num_new_selected + + result_df: bigframes.dataframe.DataFrame = self._df.copy() + return result_df[df.set_index("old_index")[status_column] > 0.0] + + @staticmethod + def _topk_partition( + df, + column: str, + status_column: str, + user_instruction: str, + model, + k: int, + ground_with_google_search: bool, + ): + output_instruction = ( + "Given a question and two documents, choose the document that best answers " + "the question. Respond with 'Document 1' or 'Document 2'. You must choose " + "one, even if neither is ideal. " + ) + + # Random pivot selection for improved average quickselect performance. + pending_df = df[df[status_column].isna()] + pivot_iloc = np.random.randint(0, pending_df.shape[0]) + pivot_index = pending_df.iloc[pivot_iloc]["index"] + pivot_df = pending_df[pending_df["index"] == pivot_index] + + # Build a prompt to compare the pivot item's relevance to other pending items. 
+ prompt_s = pending_df[pending_df["index"] != pivot_index][column] + prompt_s = ( + f"{output_instruction}\n\nQuestion: {user_instruction}\n" + + f"\nDocument 1: {column} " + + pivot_df.iloc[0][column] + + f"\nDocument 2: {column} " + + prompt_s # type:ignore + ) + + import bigframes.dataframe + + predict_df = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + prompt_s, + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + + marks = predict_df["ml_generate_text_llm_result"].str.contains("2") + more_relavant: bigframes.dataframe.DataFrame = df[marks] + less_relavent: bigframes.dataframe.DataFrame = df[~marks] + + num_more_relavant = more_relavant.shape[0] + if k < num_more_relavant: + less_relavent[status_column] = -1.0 + pivot_df[status_column] = -1.0 + df = df.combine_first(less_relavent).combine_first(pivot_df) + return df, 0 + else: # k >= num_more_relavant + more_relavant[status_column] = 1.0 + df = df.combine_first(more_relavant) + if k >= num_more_relavant + 1: + pivot_df[status_column] = 1.0 + df = df.combine_first(pivot_df) + return df, num_more_relavant + 1 + else: + return df, num_more_relavant + + def sim_join( + self, + other, + left_on: str, + right_on: str, + model, + top_k: int = 3, + score_column: Optional[str] = None, + max_rows: int = 1000, + ): + """ + Joins two dataframes based on the similarity of the specified columns. + + This method uses BigQuery's VECTOR_SEARCH function to match rows on the left side with the rows that have + nearest embedding vectors on the right. In the worst case scenario, the complexity is around O(M * N * log K). + Therefore, this is a potentially expensive operation. 
+ + ** Examples: ** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + + >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) + >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) + + >>> df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + animal animal_1 + 0 monkey baboon + 1 spider scorpion + + [2 rows x 2 columns] + + Args: + other (DataFrame): + The other data frame to join with. + left_on (str): + The name of the column on left side for the join. + right_on (str): + The name of the column on the right side for the join. + top_k (int, default 3): + The number of nearest neighbors to return. + model (TextEmbeddingGenerator): + A TextEmbeddingGenerator provided by Bigframes ML package. + score_column (Optional[str], default None): + The name of the the additional column containning the similarity scores. If None, + this column won't be attached to the result. + max_rows: + The maximum number of rows allowed to be processed per call. If the result is too large, the method + call will end early with an error. + + Returns: + DataFrame: the data frame with the join result. + + Raises: + ValueError: when the amount of data to be processed exceeds the specified max_rows. 
+ """ + + if left_on not in self._df.columns: + raise ValueError(f"Left column {left_on} not found") + if right_on not in self._df.columns: + raise ValueError(f"Right column {right_on} not found") + + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + joined_table_rows = len(self._df) * len(other) + if joined_table_rows > max_rows: + raise ValueError( + f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." + ) + + if top_k < 1: + raise ValueError("top_k must be an integer greater than or equal to 1.") + + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) + + base_table_embedding_column = guid.generate_guid() + base_table = self._attach_embedding( + other, right_on, base_table_embedding_column, model + ).to_gbq() + query_table = self._attach_embedding(self._df, left_on, "embedding", model) + + import bigframes.bigquery as bbq + + join_result = bbq.vector_search( + base_table=base_table, + column_to_search=base_table_embedding_column, + query=query_table, + top_k=top_k, + ) + + join_result = join_result.drop( + ["embedding", base_table_embedding_column], axis=1 + ) + + if score_column is not None: + join_result = join_result.rename(columns={"distance": score_column}) + else: + del join_result["distance"] + + return join_result + + @staticmethod + def _attach_embedding(dataframe, source_column: str, embedding_column: str, model): + result_df = dataframe.copy() + embeddings = model.predict(dataframe[source_column])[ + "ml_generate_embedding_result" + ] + result_df[embedding_column] = embeddings + return result_df + + @staticmethod + def _make_multimodel_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str + ): + prompt = [f"{output_instruction}\n{user_instruction}\nContext: "] + for col in columns: + prompt.extend([f"{col} is ", prompt_df[col]]) + + 
return prompt + + @staticmethod + def _make_text_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str + ): + prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: " + + # Combine context from multiple columns. + for col in columns: + prompt_df["prompt"] += f"{col} is `" + prompt_df[col] + "`\n" + + return prompt_df["prompt"] + + @staticmethod + def _parse_columns(instruction: str) -> List[str]: + """Extracts column names enclosed in curly braces from the user instruction. + For example, _parse_columns("{city} is in {continent}") == ["city", "continent"] + """ + columns = re.findall(r"(? str: + """Extracts column names enclosed in curly braces from the user instruction. + For example, `_format_instruction(["city", "continent"], "{city} is in {continent}") + == "city is in continent"` + """ + return instruction.format(**{col: col for col in columns}) + + @staticmethod + def _validate_model(model): + from bigframes.ml.llm import GeminiTextGenerator + + if not isinstance(model, GeminiTextGenerator): + raise TypeError("Model is not GeminiText Generator") + + @staticmethod + def _confirm_operation(row_count: int): + """Raises OperationAbortedError when the confirmation fails""" + import bigframes # Import in the function body to avoid circular imports. + + threshold = bigframes.options.compute.ai_ops_confirmation_threshold + + if threshold is None or row_count <= threshold: + return + + if bigframes.options.compute.ai_ops_threshold_autofail: + raise exceptions.OperationAbortedError( + f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows." + ) + + # Separate the prompt out. In IDE such VS Code, leaving prompt in the + # input function makes it less visible to the end user. 
+ print(f"This operation will process about {row_count} rows.") + print( + "You can raise the confirmation threshold by setting `bigframes.options.compute.ai_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." + ) + print("Proceed? [Y/n]") + reply = input().casefold() + if reply not in {"y", "yes", ""}: + raise exceptions.OperationAbortedError("Operation was cancelled.") diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 3b22f19539..d1089f993e 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -857,9 +857,7 @@ def top_k( if column not in self._df.columns: raise ValueError(f"Column {column} not found.") if len(columns) > 1: - raise NotImplementedError( - "Semantic aggregations are limited to a single column." - ) + raise NotImplementedError("Semantic top K are limited to a single column.") if ground_with_google_search: msg = exceptions.format_message( @@ -900,7 +898,9 @@ def top_k( # - 1.0: Selected as part of the top-k items # - -1.0: Excluded from the top-k items status_column = guid.generate_guid("status") - df[status_column] = bigframes.series.Series(None, dtype=dtypes.FLOAT_DTYPE) + df[status_column] = bigframes.series.Series( + None, dtype=dtypes.FLOAT_DTYPE, session=df._session + ) num_selected = 0 while num_selected < k: diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index bc9f714416..4e231bd821 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -34,3 +34,11 @@ Struct handling :members: :inherited-members: :undoc-members: + +AI operators +^^^^^^^^^^^^ + +.. 
autoclass:: bigframes.operations.ai.AIAccessor + :members: + :inherited-members: + :undoc-members: \ No newline at end of file diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b4f513b11d..b00044b087 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -45,6 +45,8 @@ uid: bigframes.operations.plotting.PlotAccessor - name: StructAccessor uid: bigframes.operations.structs.StructFrameAccessor + - name: AI + uid: bigframes.operations.ai.AIAccessor - items: - name: DataFrameGroupBy uid: bigframes.core.groupby.DataFrameGroupBy diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py new file mode 100644 index 0000000000..04074a2ea6 --- /dev/null +++ b/tests/system/large/operations/test_ai.py @@ -0,0 +1,956 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from contextlib import nullcontext +from unittest.mock import patch + +import pandas as pd +import pandas.testing +import pytest + +import bigframes +from bigframes import dataframe, exceptions, series + +AI_OP_EXP_OPTION = "experiments.ai_operators" +BLOB_EXP_OPTION = "experiments.blob" +THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" + + +def test_ai_experiment_off_raise_error(): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( + NotImplementedError + ): + df.ai + + +def test_filter(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame( + {"country": ["Germany"], "city": ["Berlin"], "year": [2024]}, index=[1] + ) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +def test_filter_multi_model(session, gemini_flash_model): + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["prey"] = series.Series( + ["building", "cross road", "rock", "squirrel", "rabbit"], session=session + ) + result = df.ai.filter( + "The object in {image} feeds on {prey}", + gemini_flash_model, + ).to_pandas() + + assert len(result) <= len(df) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_filter_with_confirmation(session, 
gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.filter("{city} is the capital of {country} in {year}", gemini_flash_model) + + +def test_filter_single_column_reference(session, gemini_flash_model): + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{country} is in Europe", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{city} is in the {non_existing_column}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.filter(instruction, gemini_flash_model) + + +def test_filter_invalid_model_raise_error(): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), 
pytest.raises(TypeError): + df.ai.filter("{city} is the capital of {country}", None) + + +def test_map(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ).to_pandas() + # Result sanitation + actual_df["food"] = actual_df["food"].str.strip().str.lower() + + expected_df = pd.DataFrame( + { + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + "food": ["burger", "tofu"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +def test_map_multimodel(session, gemini_flash_model): + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["scenario"] = series.Series( + ["building", "cross road", "tree", "squirrel", "rabbit"], session=session + ) + result = df.ai.map( + "What is the object in {image} combined with {scenario}? 
One word only.", + "object", + gemini_flash_model, + ).to_pandas() + + assert len(result) == len(df) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "What is the food made from {ingredient_1} and {non_existing_column}?}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "id": [1, 2], + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + } + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.map(instruction, "food", gemini_flash_model) + + +def test_map_invalid_model_raise_error(): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + }, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df.ai.map( + 
"What is the food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + None, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param("{city} is in {country}", id="no_dataframe_reference"), + pytest.param("{left.city} is in {country}", id="has_left_dataframe_reference"), + pytest.param( + "{city} is in {right.country}", + id="has_right_dataframe_reference", + ), + pytest.param( + "{left.city} is in {right.country}", id="has_both_dataframe_references" + ), + ], +) +def test_join(instruction, session, gemini_flash_model): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.ai.join( + countries, + instruction, + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "city": ["Seattle", "Berlin"], + "country": ["USA", "Germany"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + cities.ai.join( + countries, + "{city} is in {country}", + gemini_flash_model, + ) + + +def test_self_join(session, gemini_flash_model): + animals = dataframe.DataFrame( + data={ + 
"animal": ["spider", "capybara"], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = animals.ai.join( + animals, + "{left.animal} is heavier than {right.animal}", + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "animal_left": ["capybara"], + "animal_right": ["spider"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + ("instruction", "error_pattern"), + [ + ("No column reference", "No column references"), + pytest.param( + "{city} is in {continent}", r"Column .+ not found", id="non_existing_column" + ), + pytest.param( + "{city} is in {country}", + r"Ambiguous column reference: .+", + id="ambiguous_column", + ), + pytest.param( + "{right.city} is in {country}", r"Column .+ not found", id="wrong_prefix" + ), + pytest.param( + "{city} is in {right.continent}", + r"Column .+ not found", + id="prefix_on_non_existing_column", + ), + ], +) +def test_join_invalid_instruction_raise_error( + instruction, error_pattern, gemini_flash_model +): + df1 = dataframe.DataFrame( + {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} + ) + df2 = dataframe.DataFrame( + { + "country": ["USA", "UK", "Germany"], + "region": ["North America", "Europe", "Europe"], + } + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError, match=error_pattern): + df1.ai.join(df2, instruction, gemini_flash_model) + + +def test_join_invalid_model_raise_error(): + cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) + countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + cities.ai.join(countries, "{city} is in {country}", None) + + 
+@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_search(session, text_embedding_generator, score_column): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.ai.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + score_column=score_column, + ).to_pandas() + + expected_result = pd.Series( + ["baboons", "chimpanzee"], index=[2, 4], name="creatures" + ) + pandas.testing.assert_series_equal( + actual_result["creatures"], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 1 + else: + assert score_column in actual_result.columns + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_search_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + ) + + +def test_search_invalid_column_raises_error(session, text_embedding_generator): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.search("whatever", "monkey", top_k=2, 
model=text_embedding_generator) + + +def test_search_invalid_model_raises_error(session): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df.ai.search("creatures", "monkey", top_k=2, model=None) + + +def test_search_invalid_top_k_raises_error(session, text_embedding_generator): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.search("creatures", "monkey", top_k=0, model=text_embedding_generator) + + +@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_sim_join(session, text_embedding_generator, score_column): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + score_column=score_column, + ).to_pandas() + + expected_result = pd.DataFrame( + {"creatures": ["salmon", "cat"], "creatures_1": ["tuna", "dog"]} + ) + pandas.testing.assert_frame_equal( + actual_result[["creatures", "creatures_1"]], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 2 + else: + assert score_column in actual_result.columns + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", 
marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_sim_join_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + pytest.param("whatever", "creatures", id="incorrect_left_column"), + pytest.param("creatures", "whatever", id="incorrect_right_column"), + ], +) +def test_sim_join_invalid_column_raises_error( + session, text_embedding_generator, left_on, right_on +): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, left_on=left_on, right_on=right_on, model=text_embedding_generator + ) + + +def test_sim_join_invalid_model_raises_error(session): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df1.ai.sim_join(df2, left_on="creatures", right_on="creatures", model=None) + + +def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + 
data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + top_k=0, + model=text_embedding_generator, + ) + + +def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + max_rows=1, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals} and {Animals}", + id="two_columns", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "{index}", + id="preserved", + marks=pytest.mark.xfail(raises=ValueError), + ), + ], +) +def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame( + { + "Animals": ["Dog", "Cat", "Bird", "Horse"], + "ID": [1, 2, 3, 4], + "index": ["a", "b", "c", "d"], + } + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.ai.top_k(instruction, model=gemini_flash_model, k=2) + + +def test_top_k_invalid_k_raise_error(gemini_flash_model): + df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.top_k( + "{Animals} are more 
popular as pets", + gemini_flash_model, + k=0, + ) + + +@patch("builtins.input", return_value="") +def test_confirm_operation__below_threshold_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 3, + ): + df.ai._confirm_operation(1) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + None, + ): + df.ai._confirm_operation(100) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 1, + "compute.ai_ops_threshold_autofail", + True, + ), pytest.raises(exceptions.OperationAbortedError): + df.ai._confirm_operation(100) + + mock_input.assert_not_called() + + +@pytest.mark.parametrize( + ("reply", "expectation"), + [ + ("y", nullcontext()), + ("yes", nullcontext()), + ("", nullcontext()), + ("n", pytest.raises(exceptions.OperationAbortedError)), + ("something", pytest.raises(exceptions.OperationAbortedError)), + ], +) +def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypatch): + monkeypatch.setattr("builtins.input", lambda: reply) + df = dataframe.DataFrame({}) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 3, + ), expectation as e: + assert df.ai._confirm_operation(4) == e diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py new file mode 100644 index 0000000000..de6ba4b86c --- /dev/null +++ b/tests/system/small/operations/test_ai.py @@ -0,0 +1,141 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Note that the tests in this files uses fake models for deterministic results. +# Tests that use real LLM models are under system/large/test_ai.py + +import pandas as pd +import pandas.testing +import pytest + +import bigframes +from bigframes import dataframe, dtypes +from bigframes.ml import llm + +AI_OP_EXP_OPTION = "experiments.ai_operators" +THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" + + +class FakeGeminiTextGenerator(llm.GeminiTextGenerator): + def __init__(self, prediction): + self.prediction = prediction + + def predict(self, *args, **kwargs): + return self.prediction + + +def test_experiment_off_raise_error(session): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session + ) + + with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( + NotImplementedError + ): + df.ai + + +def test_filter(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.filter( + "filter {col}", + model=model, + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col": ["A"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + +def test_map(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, 
session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.map("map {col}", model=model, output_column="output").to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + {"col": ["A", "B"], "output": ["true", "false"]}, dtype=dtypes.STRING_DTYPE + ), + check_index_type=False, + ) + + +def test_join(session): + left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) + right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = left_df.ai.join( + right_df, "join {col_A} and {col_B}", model + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + +def test_top_k(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["Document 1"]}, session=session + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() + + assert len(result) == 1 diff --git a/tests/system/small/operations/test_semantics.py b/tests/system/small/operations/test_semantics.py index 85777faaf6..8b520d8c03 100644 --- a/tests/system/small/operations/test_semantics.py +++ b/tests/system/small/operations/test_semantics.py @@ -122,3 +122,22 @@ def test_join(session): pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), check_index_type=False, ) + + +def test_top_k(session): + df = 
dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["Document 1"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.semantics.top_k("top k of {col}", model, k=1).to_pandas() + + assert len(result) == 1 diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index 9735e494be..ce1dd0f146 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -27,12 +27,27 @@ def test_semantic_operators_default_false(): def test_semantic_operators_set_true_shows_warning(): options = experiment_options.ExperimentOptions() - with pytest.warns(bfe.PreviewWarning): + with pytest.warns(FutureWarning): options.semantic_operators = True assert options.semantic_operators is True +def test_ai_operators_default_false(): + options = experiment_options.ExperimentOptions() + + assert options.ai_operators is False + + +def test_ai_operators_set_true_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(bfe.PreviewWarning): + options.ai_operators = True + + assert options.ai_operators is True + + def test_blob_default_false(): options = experiment_options.ExperimentOptions() diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index a6ad5e3821..8d1e77510a 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -89,3 +89,14 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( destination = dataframe.to_gbq() assert destination.startswith(anonymous_dataset_id) + + +def test_dataframe_semantics_property_future_warning( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with bigframes.option_context("experiments.semantic_operators", True), pytest.warns( + FutureWarning + ): + 
dataframe.semantics From 7e5b6a873d00162ffca3d254d3af276c5f06d866 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 25 Mar 2025 14:43:58 -0500 Subject: [PATCH 22/39] feat: add Linear_Regression.global_explain() (#1446) * feat: add Linear_Regression.global_explain() * remove class_level_explain param * working global_explain() * begin adding tests * update snippet * complete snippet * failing, near complete linear model test * passing system test * Update core.py - set index to have sorted by feature * Update test_linear_model.py - remove set/set index * Update linear_model.py - fix doc section * Update conftest.py - rename penguins w global explain * Update linear_model.py - complete doc * lint * passing test and fixed expected results --- bigframes/ml/core.py | 10 ++++++ bigframes/ml/linear_model.py | 20 +++++++++++ bigframes/ml/sql.py | 6 ++++ .../linear_regression_tutorial_test.py | 25 +++++++++++++ tests/system/small/ml/conftest.py | 9 +++++ tests/system/small/ml/test_linear_model.py | 36 +++++++++++++++++++ 6 files changed, 106 insertions(+) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ad00ed3f2c..01917fd6d8 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -134,6 +134,16 @@ def explain_predict( ), ) + def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: + sql = self._model_manipulation_sql_generator.ml_global_explain( + struct_options=options + ) + return ( + self._session.read_gbq(sql) + .sort_values(by="attribution", ascending=False) + .set_index("feature") + ) + def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 46c5744a42..3774a62c0c 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -203,6 +203,26 @@ def predict_explain( X, options={"top_k_features": top_k_features} ) + def global_explain( + self, + ) -> bpd.DataFrame: + """ + Provide 
explanations for an entire linear regression model. + + .. note:: + Output matches that of the BigQuery ML.GLOBAL_EXPLAIN function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-global-explain + + Returns: + bigframes.pandas.DataFrame: + Dataframes containing feature importance values and corresponding attributions, designed to provide a global explanation of feature influence. + """ + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before predict") + + return self._bqml_model.global_explain({}) + def score( self, X: utils.ArrayType, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index b662d4c22c..e89f17bcaa 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -312,6 +312,12 @@ def ml_explain_predict( return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" + def ml_global_explain(self, struct_options) -> str: + """Encode ML.GLOBAL_EXPLAIN for BQML""" + struct_options_sql = self.struct_options(**struct_options) + return f"""SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {self._model_ref_sql()}, + {struct_options_sql})""" + def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" struct_options_sql = self.struct_options(**struct_options) diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index e4ace53a5c..8fc1c5ad61 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -92,6 +92,31 @@ def test_linear_regression(random_model_id: str) -> None: # 3 5349.603734 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 5349.603734 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 15.6 221.0 5000.0 MALE # 4 4637.165037 [{'feature': 'island', 'attribution': 7348.877... 
-5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE # [END bigquery_dataframes_bqml_linear_predict_explain] + # [START bigquery_dataframes_bqml_linear_global_explain] + # To use the `global_explain()` function, the model must be recreated with `enable_global_explain` set to `True`. + model = LinearRegression(enable_global_explain=True) + + # The model must then be fitted before it can be saved to BigQuery and then explained. + training_data = bq_df.dropna(subset=["body_mass_g"]) + X = training_data.drop(columns=["body_mass_g"]) + y = training_data[["body_mass_g"]] + model.fit(X, y) + model.to_gbq("bqml_tutorial.penguins_model", replace=True) + + # Explain the model + explain_model = model.global_explain() + + # Expected results: + # attribution + # feature + # island 5737.315921 + # species 4073.280549 + # sex 622.070896 + # flipper_length_mm 193.612051 + # culmen_depth_mm 117.084944 + # culmen_length_mm 94.366793 + # [END bigquery_dataframes_bqml_linear_global_explain] + assert explain_model is not None assert feature_columns is not None assert label_columns is not None assert model is not None diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 0e8489c513..2b9392f523 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -84,6 +84,15 @@ def ephemera_penguins_linear_model( return bf_model +@pytest.fixture(scope="function") +def penguins_linear_model_w_global_explain( + penguins_bqml_linear_model: core.BqmlModel, +) -> linear_model.LinearRegression: + bf_model = linear_model.LinearRegression(enable_global_explain=True) + bf_model._bqml_model = penguins_bqml_linear_model + return bf_model + + @pytest.fixture(scope="session") def penguins_logistic_model( session, penguins_logistic_model_name diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index da9fc8e14f..8b04d55e61 100644 --- 
a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -228,6 +228,42 @@ def test_to_gbq_saved_linear_reg_model_scores( ) +def test_linear_reg_model_global_explain( + penguins_linear_model_w_global_explain, new_penguins_df +): + training_data = new_penguins_df.dropna(subset=["body_mass_g"]) + X = training_data.drop(columns=["body_mass_g"]) + y = training_data[["body_mass_g"]] + penguins_linear_model_w_global_explain.fit(X, y) + global_ex = penguins_linear_model_w_global_explain.global_explain() + assert global_ex.shape == (6, 1) + expected_columns = pandas.Index(["attribution"]) + pandas.testing.assert_index_equal(global_ex.columns, expected_columns) + result = global_ex.to_pandas().drop(["attribution"], axis=1).sort_index() + expected_feature = ( + pandas.DataFrame( + { + "feature": [ + "island", + "species", + "sex", + "flipper_length_mm", + "culmen_depth_mm", + "culmen_length_mm", + ] + }, + ) + .set_index("feature") + .sort_index() + ) + pandas.testing.assert_frame_equal( + result, + expected_feature, + check_exact=False, + check_index_type=False, + ) + + def test_to_gbq_replace(penguins_linear_model, table_id_unique): penguins_linear_model.to_gbq(table_id_unique, replace=True) with pytest.raises(google.api_core.exceptions.Conflict): From 222af752c21937a673b0471194e09f9607ad1d08 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 25 Mar 2025 13:06:52 -0700 Subject: [PATCH 23/39] chore: set up release-please for previous major version (1.x) (#1535) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: configure CI for 1.x branch (#1528) * chore: set up release-please for previous major version (1.x) * revert owlbot.py change as it already excludes release-please.yml * chore: set up release-please for previous major version (1.x) * revert owlbot.py change as it already excludes release-please.yml * Revert "chore: configure CI for 1.x branch (#1528)" This reverts commit 
54e63ea395f49a29d8f5c50a0e8fc5c08194e996. * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .github/release-please.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/release-please.yml b/.github/release-please.yml index 8c34d1b49f..7c2b8d9e8a 100644 --- a/.github/release-please.yml +++ b/.github/release-please.yml @@ -3,3 +3,8 @@ handleGHRelease: true extraFiles: - bigframes/version.py - third_party/bigframes_vendored/version.py + +branches: + - branch: v1 + handleGHRelease: true + releaseType: python From bdcb1e7929dc2f24c642ddb052629da394f45876 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 25 Mar 2025 13:35:44 -0700 Subject: [PATCH 24/39] feat: Support window partition by geo column (#1512) --- bigframes/core/compile/compiled.py | 6 ++++-- tests/system/small/geopandas/test_geoseries.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d2fd7f3ea2..4443c495d7 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -665,9 +665,11 @@ def _join_condition( def _as_groupable(value: ibis_types.Value): - # Some types need to be converted to string to enable groupby - if value.type().is_float64() or value.type().is_geospatial(): + # Some types need to be converted to another type to enable groupby + if value.type().is_float64(): return value.cast(ibis_dtypes.str) + elif value.type().is_geospatial(): + return typing.cast(ibis_types.GeoSpatialColumn, value).as_binary() elif value.type().is_json(): return scalar_op_compiler.to_json_string(value) else: diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index fdd9826468..fb101dea89 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ 
b/tests/system/small/geopandas/test_geoseries.py @@ -289,3 +289,20 @@ def test_geo_difference_with_similar_geometry_objects(): assert expected.iloc[0].equals(bf_result.iloc[0]) assert expected.iloc[1].equals(bf_result.iloc[1]) assert expected.iloc[2].equals(bf_result.iloc[2]) + + +def test_geo_drop_duplicates(): + bf_series = bigframes.geopandas.GeoSeries( + [Point(1, 1), Point(2, 2), Point(3, 3), Point(2, 2)] + ) + + pd_series = geopandas.GeoSeries( + [Point(1, 1), Point(2, 2), Point(3, 3), Point(2, 2)] + ) + + bf_result = bf_series.drop_duplicates().to_pandas() + pd_result = pd_series.drop_duplicates() + + pd.testing.assert_series_equal( + geopandas.GeoSeries(bf_result), pd_result, check_index=False + ) From c93e7204758435b0306699d3a1332aaf522f576b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 25 Mar 2025 14:37:55 -0700 Subject: [PATCH 25/39] fix: add deprecation warning to TextEmbeddingGenerator model, espeically gemini-1.0-X and gemini-1.5-X (#1534) --- bigframes/ml/llm.py | 8 + .../apps/synthetic_data_generation.ipynb | 849 +------------ .../bq_dataframes_llm_code_generation.ipynb | 2 +- .../bq_dataframes_llm_kmeans.ipynb | 2 +- .../bq_dataframes_llm_vector_search.ipynb | 1124 +++++++++-------- ...q_dataframes_ml_drug_name_generation.ipynb | 2 +- .../generative_ai/large_language_models.ipynb | 81 +- .../bq_dataframes_template.ipynb | 2 +- samples/snippets/gemini_model_test.py | 4 +- tests/system/load/test_llm.py | 3 +- tests/system/small/ml/conftest.py | 17 + tests/system/small/ml/test_llm.py | 56 +- 12 files changed, 715 insertions(+), 1435 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 0117444f16..1fd9fbc4a7 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -918,15 +918,23 @@ def to_gbq( return new_model.session.read_gbq_model(model_name) +@typing_extensions.deprecated( + "gemini-pro and gemini-1.5-X are going to be deprecated. 
Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", + category=exceptions.ApiDeprecationWarning, +) @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. + .. note:: + gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + Args: model_name (str, Default to "gemini-pro"): The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002" and "gemini-2.0-flash-exp". Default to "gemini-pro". .. note:: + "gemini-pro" is going to be deprecated. Bigframes 2 will transition to using gemini-2.0-X. "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. 
For more information, see the launch stage descriptions diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index a6e8444aac..c190f219af 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -52,12 +52,12 @@ "output_type": "stream", "text": [ "Collecting faker\n", - " Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.4 in /usr/local/lib/python3.10/dist-packages (from faker) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.4->faker) (1.16.0)\n", - "Installing collected packages: faker\n", - "Successfully installed faker-24.9.0\n" + " Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)\n", + "Requirement already satisfied: tzdata in /usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages (from faker) (2024.2)\n", + "Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m55.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: faker\n", + "Successfully installed faker-37.1.0\n" ] } ], @@ -67,11 +67,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "id": "m3q1oeJALhsG" }, - "outputs": [], + 
"outputs": [ + { + "ename": "NameError", + "evalue": "name 'PROJECT_ID' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mbigframes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mbpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m bpd\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mbigquery\u001b[38;5;241m.\u001b[39mproject \u001b[38;5;241m=\u001b[39m \u001b[43mPROJECT_ID\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'PROJECT_ID' is not defined" + ] + } + ], "source": [ "import bigframes.pandas as bpd\n", "bpd.options.bigquery.project = PROJECT_ID" @@ -95,32 +107,11 @@ "id": "lIYdn1woOS1n", "outputId": "be474338-44c2-4ce0-955e-d525b8b9c84b" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/bigframes/session/__init__.py:1907: UserWarning: No explicit location is set, so using location US for the session.\n", - " return Session(context)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 3e8423da-737c-42e2-a3d2-d2180ca18579 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator()" + "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" ] }, { @@ -141,77 +132,7 @@ "id": "SSR-lLScLa95", "outputId": "cbaec34e-6fa6-45b4-e54a-f11ca06b61e1" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d651d0bf-300c-4b1d-9e3c-03310b71287c is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job c67b9bb9-2f3e-4b9e-b680-0b7b6e9d2279 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
prompt
0Write python code to generate a pandas datafra...
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " prompt\n", - "0 Write python code to generate a pandas datafra...\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "prompt = \"\"\"\\\n", "Write python code to generate a pandas dataframe based on the requirements:\n", @@ -248,73 +169,7 @@ "id": "miDe3K4GNvOo", "outputId": "f2039e80-5ad7-4551-f8b2-7ef714a89d63" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d5c0725d-9070-4712-adfd-8a9bd86eefc3 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4eb581a3-7f97-411a-bee1-91e8c150cef4 is DONE. 8 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f3d5503d-a3e7-49ce-b985-5ffbdbd856e3 is DONE. 2 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8ef76041-f077-4a05-bc03-63e6983ef853 is DONE. 332 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "import pandas as pd\n", - "from faker import Faker\n", - "\n", - "fake = Faker('es_ES')\n", - "result_df = pd.DataFrame({\n", - " 'Name': [fake.name() for _ in range(100)],\n", - " 'Age': [fake.random_int(min=18, max=65) for _ in range(100)],\n", - " 'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]\n", - "})\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "max_tries = 5\n", "for i in range(max_tries):\n", @@ -366,342 +221,7 @@ "id": "GODcPwX2PBEu", "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", - "

100 rows × 3 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " Name Age Gender\n", - "0 Pastora Acuña Company 21 Male\n", - "1 León Reig-Salom 39 Non-binary\n", - "2 Aura Tomás Llobet 30 Female\n", - "3 Vicente Correa Palomar 64 Female\n", - "4 Benito del Fuster 34 Female\n", - ".. ... ... ...\n", - "95 Eduardo Cabrera 27 Non-binary\n", - "96 Nazaret de Izaguirre 40 Non-binary\n", - "97 Manuela Agullo Bustamante 27 Female\n", - "98 Eugenio Mateo Naranjo Blazquez 36 Non-binary\n", - "99 Heriberto Vicens Baeza 53 Female\n", - "\n", - "[100 rows x 3 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "execution_context = {}\n", "exec(code, execution_context)\n", @@ -726,21 +246,7 @@ "id": "n-BsGciNqSwU", "outputId": "996e5639-a49c-4542-a0dc-ede450e0eb6d" }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n", "def data_generator(id):\n", @@ -770,20 +276,7 @@ "id": "Odkmev9nsYqA", "outputId": "4aa7a1fd-0c0d-4412-f326-a20e19f583b5" }, - "outputs": [ - { - "data": { - "text/html": [ - "Load job 40b9c3a8-27fc-40a8-9edf-4aa2e0fec332 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "desired_num_rows = 1_000_000 # 1 million rows\n", "batch_size = 100 # used in the prompt\n", @@ -803,20 +296,7 @@ "id": "UyBhlJFVsmQC", "outputId": "29748df5-673b-4320-bb1f-53abaace3b81" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9dd49b50-2dbf-4351-b9ad-b17aeb627caf is DONE. 240.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "df[\"json_data\"] = df[\"row_id\"].apply(data_generator)" ] @@ -839,262 +319,7 @@ "id": "6p3eM21qvRvy", "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", - "

25 rows × 3 columns

\n", - "
[1000000 rows x 3 columns in total]" - ], - "text/plain": [ - " Name Age Gender\n", - "0 Eloy Santiago-Aragón 31 Male\n", - "1 Amanda Mata Abril 20 Non-binary\n", - "2 Danilo Velázquez Salcedo 58 Male\n", - "3 Leyre Alba España 61 Female\n", - "4 Paulina Amores Pastor 41 Male\n", - "5 Jorge Cuadrado Mena 50 Female\n", - "6 Chucho Catalán 36 Non-binary\n", - "7 Vidal Benavente Lerma 38 Male\n", - "8 Clementina Álamo 32 Female\n", - "9 Petrona Roselló-Valls 61 Male\n", - "10 Luís Camilo Sastre Marin 45 Male\n", - "11 Gil Baudelio Carbajo Ordóñez 58 Non-binary\n", - "12 David del Donoso 44 Female\n", - "13 Dolores Arnau Ros 21 Non-binary\n", - "14 Febe de León 46 Non-binary\n", - "15 Ariadna Almazán 34 Female\n", - "16 Blas Serna Aguiló 24 Non-binary\n", - "17 Paulino Barreda Almeida 59 Female\n", - "18 Eligio Valcárcel Tormo 35 Non-binary\n", - "19 Toño Amador Torres Portillo 48 Female\n", - "20 Florencia del Bejarano 65 Non-binary\n", - "21 Clímaco Andreu Gómez 18 Male\n", - "22 Xiomara Dominguez Solana 35 Female\n", - "23 Leire Castilla Borrego 19 Non-binary\n", - "24 Angelita Garmendia Carpio 21 Non-binary\n", - "...\n", - "\n", - "[1000000 rows x 3 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sql = f\"\"\"\n", "WITH T0 AS ({df.sql}),\n", @@ -1126,6 +351,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index db51afd412..88633f8635 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ 
b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -430,7 +430,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator()" + "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 254ac65358..31a47ea424 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -1614,7 +1614,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "q_a_model = GeminiTextGenerator()" + "q_a_model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index 20d5b4161d..a15209aae4 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -61,14 +61,14 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "vFMjpPBo9aVv" + }, "source": [ "**Author:** Sudipto Guha (Google)\n", "\n", "**Last updated:** March 16th 2025" - ], - "metadata": { - "id": "vFMjpPBo9aVv" - } + ] }, { "cell_type": "markdown", @@ -136,6 +136,9 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "GqLjnm1hsKGU" + }, "source": [ "## Setup & initialization\n", "\n", @@ -144,10 +147,7 @@ "For [Vector embedding generation](https://cloud.google.com/bigquery/docs/generate-text-embedding#required_roles)\n", "\n", "For [Vector Index creation](https://cloud.google.com/bigquery/docs/vector-index#roles_and_permissions)" - ], - "metadata": { - "id": "GqLjnm1hsKGU" - } + ] }, { "cell_type": "markdown", @@ -198,17 +198,17 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "id": "b8bKCfIiooEV", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191597773, 
- "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "b8bKCfIiooEV" }, "outputs": [], "source": [ @@ -284,23 +284,23 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "j3lmnsh7ooEW", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191608487, - "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "j3lmnsh7ooEW", "outputId": "eb68daf5-5558-487a-91d2-4b4f9e476da0" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "WARNING: google.colab.auth.authenticate_user() is not supported in Colab Enterprise.\n" ] @@ -342,17 +342,17 @@ "cell_type": "code", "execution_count": 33, "metadata": { - "id": "R7STCS8xB5d2", "executionInfo": { + "elapsed": 947, "status": "ok", "timestamp": 1742195413800, - "user_tz": -480, - "elapsed": 947, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "R7STCS8xB5d2" }, "outputs": [], "source": [ @@ -385,33 +385,33 @@ }, { "cell_type": "markdown", - "source": [ - "Partial ordering mode allows BigQuery DataFrames to push down many more row and column filters. On large clustered and partitioned tables, this can greatly reduce the number of bytes scanned and computation slots used. This [blog post](https://medium.com/google-cloud/introducing-partial-ordering-mode-for-bigquery-dataframes-bigframes-ec35841d95c0) goes over it in more detail." - ], "metadata": { "id": "iOFF9hrvs5WE" - } + }, + "source": [ + "Partial ordering mode allows BigQuery DataFrames to push down many more row and column filters. On large clustered and partitioned tables, this can greatly reduce the number of bytes scanned and computation slots used. This [blog post](https://medium.com/google-cloud/introducing-partial-ordering-mode-for-bigquery-dataframes-bigframes-ec35841d95c0) goes over it in more detail." 
+ ] }, { "cell_type": "code", - "source": [ - "bf.options.bigquery.ordering_mode = \"partial\"" - ], + "execution_count": 4, "metadata": { - "id": "9Gil1Oaas7KA", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191620533, - "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "9Gil1Oaas7KA" }, - "execution_count": 4, - "outputs": [] + "outputs": [], + "source": [ + "bf.options.bigquery.ordering_mode = \"partial\"" + ] }, { "cell_type": "markdown", @@ -435,26 +435,26 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "id": "zDSwoBo1CU3G", "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { + "elapsed": 468, "status": "ok", "timestamp": 1742192516923, - "user_tz": -480, - "elapsed": 468, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "zDSwoBo1CU3G", "outputId": "83edbc2f-5a23-407b-8890-f968eb31be44" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:3553: UserWarning: \u001b[93mReading cached table from 2025-03-17 06:07:09.526507+00:00 to avoid\n", "incompatibilies with previous reads of this table. To read the latest\n", @@ -472,35 +472,35 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "id": "tYDoaKgJChiq", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "executionInfo": { + "elapsed": 6697, "status": "ok", "timestamp": 1742192524632, - "user_tz": -480, - "elapsed": 6697, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "tYDoaKgJChiq", "outputId": "9174da29-a051-4a99-e38f-6a2b09cfe4e9" }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 6f15ad71-cc7b-49c1-90e9-274bea7afbb9 is DONE. 477.4 GB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -514,105 +514,33 @@ }, { "cell_type": "code", - "source": [ - "## take a look at the sample dataset\n", - "\n", - "publications.head(5)" - ], + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 556 }, - "id": "XmqdJInztzPl", "executionInfo": { + "elapsed": 6, "status": "ok", "timestamp": 1742191801044, - "user_tz": -480, - "elapsed": 6, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "XmqdJInztzPl", "outputId": "ae05f3a6-edeb-423a-c061-c416717e1ec5" }, - "execution_count": 11, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " publication_number title \\\n", - "0 AU-338190-S Compressor wheel \n", - "1 CN-100525651-C Method for processing egg products \n", - "2 TW-I725505-B Improved carbon molecular sieve adsorbent \n", - "3 EP-0248026-B1 A system for supplying strip to a processing line \n", - "4 MY-135762-A Method for producing acrylic acid \n", - "\n", - " title_translated abstract \\\n", - "0 False Newness and distinctiveness is claimed in the ... \n", - "1 False The invention discloses a processing method of... \n", - "2 False Disclosed herein are rapid cycle pressure swin... \n", - "3 False A system (10) for supplying strip material (S)... \n", - "4 False A PROCESS FOR THE FRACTIONAL CONDENSATION OF A... \n", - "\n", - " abstract_translated cpc \\\n", - "0 False [] \n", - "1 False [] \n", - "2 False [{'code': 'B01D2253/116', 'inventive': False, ... \n", - "3 False [{'code': 'B65H2701/37', 'inventive': False, '... \n", - "4 False [{'code': 'C07C51/50', 'inventive': True, 'fir... \n", - "\n", - " cpc_low \\\n", - "0 [] \n", - "1 [] \n", - "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", - "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", - "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... 
\n", - "\n", - " cpc_inventive_low \\\n", - "0 [] \n", - "1 [] \n", - "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", - "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", - "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", - "\n", - " top_terms \\\n", - "0 ['compressor wheel' 'newness' 'distinctiveness... \n", - "1 ['egg' 'processing method' 'egg body' 'pack' '... \n", - "2 ['swing adsorption' 'pressure swing' 'molecula... \n", - "3 ['strip material' 'assembly' 'coil' 'take' 'pr... \n", - "4 ['acrylic acid' 'producing acrylic' 'stabilize... \n", - "\n", - " similar \\\n", - "0 [{'publication_number': 'AU-338190-S', 'applic... \n", - "1 [{'publication_number': 'CN-101396133-B', 'app... \n", - "2 [{'publication_number': 'EP-1867379-B1', 'appl... \n", - "3 [{'publication_number': 'EP-0248026-B1', 'appl... \n", - "4 [{'publication_number': 'SG-157371-A1', 'appli... \n", - "\n", - " url country \\\n", - "0 https://patents.google.com/patent/AU338190S Australia \n", - "1 https://patents.google.com/patent/CN100525651C China \n", - "2 https://patents.google.com/patent/TWI725505B Taiwan \n", - "3 https://patents.google.com/patent/EP0248026B1 European Patent Office \n", - "4 https://patents.google.com/patent/MY135762A Malaysia \n", - "\n", - " publication_description cited_by \\\n", - "0 Design [] \n", - "1 Granted Patent [] \n", - "2 Granted Patent or patent of addition [] \n", - "3 Granted patent [] \n", - "4 Granted patent / Utility model [] \n", - "\n", - " embedding_v1 \n", - "0 [ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ... \n", - "1 [-0.05154578 -0.00437102 0.01365495 -0.168424... \n", - "2 [ 0.0163008 -0.20972364 0.02052403 -0.003073... \n", - "3 [-0.04377723 0.04111805 -0.0929429 0.043924... \n", - "4 [ 0.10407669 0.01262973 -0.22623734 -0.171453... 
" - ], + "application/vnd.google.colaboratory.intrinsic+json": { + "repr_error": "Function 'unique' has no kernel matching input types (list not null>>)", + "type": "dataframe", + "variable_name": "publications" + }, "text/html": [ "\n", "
\n", @@ -955,15 +883,87 @@ "
\n", " \n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "publications", - "repr_error": "Function 'unique' has no kernel matching input types (list not null>>)" - } + "text/plain": [ + " publication_number title \\\n", + "0 AU-338190-S Compressor wheel \n", + "1 CN-100525651-C Method for processing egg products \n", + "2 TW-I725505-B Improved carbon molecular sieve adsorbent \n", + "3 EP-0248026-B1 A system for supplying strip to a processing line \n", + "4 MY-135762-A Method for producing acrylic acid \n", + "\n", + " title_translated abstract \\\n", + "0 False Newness and distinctiveness is claimed in the ... \n", + "1 False The invention discloses a processing method of... \n", + "2 False Disclosed herein are rapid cycle pressure swin... \n", + "3 False A system (10) for supplying strip material (S)... \n", + "4 False A PROCESS FOR THE FRACTIONAL CONDENSATION OF A... \n", + "\n", + " abstract_translated cpc \\\n", + "0 False [] \n", + "1 False [] \n", + "2 False [{'code': 'B01D2253/116', 'inventive': False, ... \n", + "3 False [{'code': 'B65H2701/37', 'inventive': False, '... \n", + "4 False [{'code': 'C07C51/50', 'inventive': True, 'fir... \n", + "\n", + " cpc_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " cpc_inventive_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " top_terms \\\n", + "0 ['compressor wheel' 'newness' 'distinctiveness... \n", + "1 ['egg' 'processing method' 'egg body' 'pack' '... \n", + "2 ['swing adsorption' 'pressure swing' 'molecula... \n", + "3 ['strip material' 'assembly' 'coil' 'take' 'pr... 
\n", + "4 ['acrylic acid' 'producing acrylic' 'stabilize... \n", + "\n", + " similar \\\n", + "0 [{'publication_number': 'AU-338190-S', 'applic... \n", + "1 [{'publication_number': 'CN-101396133-B', 'app... \n", + "2 [{'publication_number': 'EP-1867379-B1', 'appl... \n", + "3 [{'publication_number': 'EP-0248026-B1', 'appl... \n", + "4 [{'publication_number': 'SG-157371-A1', 'appli... \n", + "\n", + " url country \\\n", + "0 https://patents.google.com/patent/AU338190S Australia \n", + "1 https://patents.google.com/patent/CN100525651C China \n", + "2 https://patents.google.com/patent/TWI725505B Taiwan \n", + "3 https://patents.google.com/patent/EP0248026B1 European Patent Office \n", + "4 https://patents.google.com/patent/MY135762A Malaysia \n", + "\n", + " publication_description cited_by \\\n", + "0 Design [] \n", + "1 Granted Patent [] \n", + "2 Granted Patent or patent of addition [] \n", + "3 Granted patent [] \n", + "4 Granted patent / Utility model [] \n", + "\n", + " embedding_v1 \n", + "0 [ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ... \n", + "1 [-0.05154578 -0.00437102 0.01365495 -0.168424... \n", + "2 [ 0.0163008 -0.20972364 0.02052403 -0.003073... \n", + "3 [-0.04377723 0.04111805 -0.0929429 0.043924... \n", + "4 [ 0.10407669 0.01262973 -0.22623734 -0.171453... 
" + ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } + ], + "source": [ + "## take a look at the sample dataset\n", + "\n", + "publications.head(5)" ] }, { @@ -979,35 +979,35 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "id": "li38q8FzDDMu", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "executionInfo": { + "elapsed": 4528, "status": "ok", "timestamp": 1742192047236, - "user_tz": -480, - "elapsed": 4528, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "li38q8FzDDMu", "outputId": "b8c1bd38-b484-4f71-bd38-927c8677d0c5" }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 127fb090-1c9e-4d7a-acdd-86f077a87b07 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -1018,64 +1018,53 @@ }, { "cell_type": "code", - "source": [ - "## rename abstract column to content as the desired column on which embedding will be generated\n", - "publications = publications[[\"publication_number\", \"title\", \"abstract\"]].rename(columns={'abstract': 'content'})\n", - "\n", - "## generate the embeddings\n", - "## takes ~2-3 mins to run\n", - "embedding = text_model.predict(publications)[[\"publication_number\", \"title\", \"content\", \"ml_generate_embedding_result\",\"ml_generate_embedding_status\"]]\n", - "\n", - "## filter out rows where the embedding generation failed. 
the embedding status value is empty if the embedding generation was successful\n", - "embedding = embedding[~embedding[\"ml_generate_embedding_status\"].isnull()]\n" - ], + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 139 }, - "id": "b5HHZob_u61B", "executionInfo": { + "elapsed": 126632, "status": "ok", "timestamp": 1742192656608, - "user_tz": -480, - "elapsed": 126632, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "b5HHZob_u61B", "outputId": "c9ecc5fd-5d11-4fd8-f59b-9dce4e12e371" }, - "execution_count": 19, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Load job b8079d70-7d99-4198-898f-2921915f305f is DONE. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 17338b11-420c-4d3d-bd55-0bba1247f705 is DONE. 8.9 MB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", @@ -1083,59 +1072,67 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job ebf3eb36-3199-4551-ad07-5fa5abb200be is DONE. 20.0 kB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 9e9c5aae-9045-4750-a34e-c98493369a90 is DONE. 20.0 kB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "## rename abstract column to content as the desired column on which embedding will be generated\n", + "publications = publications[[\"publication_number\", \"title\", \"abstract\"]].rename(columns={'abstract': 'content'})\n", + "\n", + "## generate the embeddings\n", + "## takes ~2-3 mins to run\n", + "embedding = text_model.predict(publications)[[\"publication_number\", \"title\", \"content\", \"ml_generate_embedding_result\",\"ml_generate_embedding_status\"]]\n", + "\n", + "## filter out rows where the embedding generation failed. the embedding status value is empty if the embedding generation was successful\n", + "embedding = embedding[~embedding[\"ml_generate_embedding_status\"].isnull()]\n" ] }, { "cell_type": "code", - "source": [ - "embedding.head(5)" - ], + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 464 }, - "id": "OIT5FbqAwqG5", "executionInfo": { + "elapsed": 6715, "status": "ok", "timestamp": 1742192727525, - "user_tz": -480, - "elapsed": 6715, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "OIT5FbqAwqG5", "outputId": "d04c994a-a0c8-44b0-e897-d871036eeb1f" }, - "execution_count": 20, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:238: AmbiguousWindowWarning: \u001b[93mWindow ordering may be ambiguous, this can cause unstable results.\u001b[0m\n", " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n", @@ -1144,63 +1141,31 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 1bc3517f-df67-456c-8d31-14a6432b8629 is DONE. 70.4 MB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job ae92602b-0eab-437f-a02d-102a4defa99a is DONE. 31.3 kB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " publication_number title \\\n", - "5753 HN-1996000102-A NEW PESTICIDES \n", - "8115 AU-325874-S Baby sling \n", - "5415 AU-2016256863-A1 Microbial compositions and methods for denitri... \n", - "8886 FR-2368509-A1 NEW DEODORANTS OR FRESHENERS AND COMPOSITIONS ... \n", - "5661 US-2006051255-A1 Gas generator \n", - "\n", - " content \\\n", - "5753 THE PRESENT INVENTION REFERS TO \n", - "8115 Adjustable baby sling with velcro. \n", - "5415 The present invention provides compositions an... \n", - "8886 Polyanionic polyamide salts comprising a conca... \n", - "5661 A gas generator insulated by a vacuum-jacket v... \n", - "\n", - " ml_generate_embedding_result \\\n", - "5753 [-0.02709213 0.0366395 0.03931784 -0.003942... \n", - "8115 [ 6.44167811e-02 -2.01051459e-02 -3.39564607e-... \n", - "5415 [-5.90537786e-02 2.38401629e-03 7.22754598e-... \n", - "8886 [-3.44522446e-02 5.64815439e-02 -1.35829514e-... \n", - "5661 [-1.50892800e-02 6.56989636e-03 2.34969519e-... \n", - "\n", - " ml_generate_embedding_status \n", - "5753 \n", - "8115 \n", - "5415 \n", - "8886 \n", - "5661 \n", - "\n", - "[5 rows x 5 columns]" - ], "text/html": [ "
\n", "