From 9a9f2bc4867e99e59fc1548497006d99883b1dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 30 Apr 2024 12:26:12 -0500 Subject: [PATCH 01/17] refactor: split `read_gbq_table` implementation into functions and move to separate module (#642) * refactor: split `read_gbq_table` implementation into functions and move to separate module add todos * refactor progress * add index_cols function * maybe ready for review * Update bigframes/session/__init__.py --- bigframes/session/__init__.py | 288 ++++--------- .../_io/{bigquery.py => bigquery/__init__.py} | 74 +--- .../session/_io/bigquery/read_gbq_table.py | 386 ++++++++++++++++++ tests/unit/session/test_io_bigquery.py | 16 - tests/unit/session/test_read_gbq_table.py | 37 ++ 5 files changed, 493 insertions(+), 308 deletions(-) rename bigframes/session/_io/{bigquery.py => bigquery/__init__.py} (74%) create mode 100644 bigframes/session/_io/bigquery/read_gbq_table.py create mode 100644 tests/unit/session/test_read_gbq_table.py diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 79febcc5d9..0f5aa19592 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -18,7 +18,6 @@ import copy import datetime -import itertools import logging import os import re @@ -43,7 +42,6 @@ # Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. import bigframes_vendored.ibis.backends.bigquery # noqa -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet import bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers @@ -62,7 +60,6 @@ import google.cloud.storage as storage # type: ignore import ibis import ibis.backends.bigquery as ibis_bigquery -import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import numpy as np import pandas @@ -80,7 +77,6 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile -import bigframes.core.guid as guid import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order @@ -92,6 +88,7 @@ from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf from bigframes.functions.remote_function import remote_function as bigframes_rf import bigframes.session._io.bigquery as bigframes_io +import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients import bigframes.version @@ -692,59 +689,6 @@ def read_gbq_table( use_cache=use_cache, ) - def _get_snapshot_sql_and_primary_key( - self, - table: google.cloud.bigquery.table.Table, - *, - api_name: str, - use_cache: bool = True, - ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]: - """Create a read-only Ibis table expression representing a table. - - If we can get a total ordering from the table, such as via primary key - column(s), then return those too so that ordering generation can be - avoided. 
- """ - ( - snapshot_timestamp, - table, - ) = bigframes_io.get_snapshot_datetime_and_table_metadata( - self.bqclient, - table_ref=table.reference, - api_name=api_name, - cache=self._df_snapshot, - use_cache=use_cache, - ) - - if table.location.casefold() != self._location.casefold(): - raise ValueError( - f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" - ) - - # If there are primary keys defined, the query engine assumes these - # columns are unique, even if the constraint is not enforced. We make - # the same assumption and use these columns as the total ordering keys. - primary_keys = None - if ( - (table_constraints := getattr(table, "table_constraints", None)) is not None - and (primary_key := table_constraints.primary_key) is not None - # This will be False for either None or empty list. - # We want primary_keys = None if no primary keys are set. - and (columns := primary_key.columns) - ): - primary_keys = columns - - try: - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table.reference, snapshot_timestamp) - ) - except google.api_core.exceptions.Forbidden as ex: - if "Drive credentials" in ex.message: - ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." - raise - - return table_expression, primary_keys - def _read_gbq_table( self, query: str, @@ -757,21 +701,47 @@ def _read_gbq_table( ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe + # --------------------------------- + # Validate and transform parameters + # --------------------------------- + if max_results and max_results <= 0: - raise ValueError("`max_results` should be a positive number.") + raise ValueError( + f"`max_results` should be a positive number, got {max_results}." + ) table_ref = bigquery.table.TableReference.from_string( query, default_project=self.bqclient.project ) - table = self.bqclient.get_table(table_ref) - (table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key( - table, api_name=api_name, use_cache=use_cache + # --------------------------------- + # Fetch table metadata and validate + # --------------------------------- + + (time_travel_timestamp, table,) = bf_read_gbq_table.get_table_metadata( + self.bqclient, + table_ref=table_ref, + api_name=api_name, + cache=self._df_snapshot, + use_cache=use_cache, ) - total_ordering_cols = primary_keys - if not index_col and primary_keys is not None: - index_col = primary_keys + if table.location.casefold() != self._location.casefold(): + raise ValueError( + f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" + ) + + # ----------------------------------------- + # Create Ibis table expression and validate + # ----------------------------------------- + + # Use a time travel to make sure the DataFrame is deterministic, even + # if the underlying table changes. + table_expression = bf_read_gbq_table.get_ibis_time_travel_table( + self.ibis_client, + table_ref, + time_travel_timestamp, + ) for key in columns: if key not in table_expression.columns: @@ -779,10 +749,22 @@ def _read_gbq_table( f"Column '{key}' of `columns` not found in this table." 
) - if isinstance(index_col, str): - index_cols: List[str] = [index_col] - else: - index_cols = list(index_col) + # --------------------------------------- + # Create a non-default index and validate + # --------------------------------------- + + # TODO(b/337925142): Move index_cols creation to before we create the + # Ibis table expression so we don't have a "SELECT *" subquery in the + # query that checks for index uniqueness. + + index_cols, is_index_unique = bf_read_gbq_table.get_index_cols_and_uniqueness( + bqclient=self.bqclient, + ibis_client=self.ibis_client, + table=table, + table_expression=table_expression, + index_col=index_col, + api_name=api_name, + ) for key in index_cols: if key not in table_expression.columns: @@ -790,62 +772,33 @@ def _read_gbq_table( f"Column `{key}` of `index_col` not found in this table." ) + # TODO(b/337925142): We should push down column filters when we get the time + # travel table to avoid "SELECT *" subqueries. if columns: table_expression = table_expression.select([*index_cols, *columns]) - # If the index is unique and sortable, then we don't need to generate - # an ordering column. - ordering = None - if total_ordering_cols is not None: - # Note: currently, a table has a total ordering only when the - # primary key(s) are set on a table. The query engine assumes such - # columns are unique, even if not enforced. - ordering = order.ExpressionOrdering( - ordering_value_columns=tuple( - order.ascending_over(column_id) for column_id in total_ordering_cols - ), - total_ordering_columns=frozenset(total_ordering_cols), - ) - column_values = [table_expression[col] for col in table_expression.columns] - array_value = core.ArrayValue.from_ibis( - self, - table_expression, - columns=column_values, - hidden_ordering_columns=[], - ordering=ordering, - ) + # ---------------------------- + # Create ordering and validate + # ---------------------------- - elif len(index_cols) != 0: - # We have index columns, lets see if those are actually total_order_columns - ordering = order.ExpressionOrdering( - ordering_value_columns=tuple( - [order.ascending_over(column_id) for column_id in index_cols] - ), - total_ordering_columns=frozenset(index_cols), - ) - is_total_ordering = self._check_index_uniqueness( - table_expression, index_cols + if is_index_unique: + array_value = bf_read_gbq_table.to_array_value_with_total_ordering( + session=self, + table_expression=table_expression, + total_ordering_cols=index_cols, ) - if is_total_ordering: - column_values = [ - table_expression[col] for col in table_expression.columns - ] - array_value = core.ArrayValue.from_ibis( - self, - table_expression, - columns=column_values, - hidden_ordering_columns=[], - ordering=ordering, - ) - else: - array_value = self._create_total_ordering( - table_expression, table_rows=table.num_rows - ) else: - array_value = self._create_total_ordering( - table_expression, table_rows=table.num_rows + # Note: Even though we're adding a default ordering here, that's + # just so we have a deterministic total ordering. If the user + # specified a non-unique index, we still sort by that later. 
+ array_value = bf_read_gbq_table.to_array_value_with_default_ordering( + session=self, table=table_expression, table_rows=table.num_rows ) + # ---------------------------------------------------- + # Create Block & default index if len(index_cols) == 0 + # ---------------------------------------------------- + value_columns = [col for col in array_value.column_ids if col not in index_cols] block = blocks.Block( array_value, @@ -862,27 +815,6 @@ def _read_gbq_table( df.sort_index() return df - def _check_index_uniqueness( - self, table: ibis_types.Table, index_cols: List[str] - ) -> bool: - distinct_table = table.select(*index_cols).distinct() - is_unique_sql = f"""WITH full_table AS ( - {self.ibis_client.compile(table)} - ), - distinct_table AS ( - {self.ibis_client.compile(distinct_table)} - ) - - SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, - (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` - """ - results, _ = self._start_query(is_unique_sql) - row = next(iter(results)) - - total_count = row["total_count"] - distinct_count = row["distinct_count"] - return total_count == distinct_count - def _read_bigquery_load_job( self, filepath_or_buffer: str | IO["bytes"], @@ -1462,66 +1394,6 @@ def _create_empty_temp_table( ) return bigquery.TableReference.from_string(table) - def _create_total_ordering( - self, - table: ibis_types.Table, - table_rows: Optional[int], - ) -> core.ArrayValue: - # Since this might also be used as the index, don't use the default - # "ordering ID" name. - - # For small tables, 64 bits is enough to avoid collisions, 128 bits will never ever collide no matter what - # Assume table is large if table row count is unknown - use_double_hash = ( - (table_rows is None) or (table_rows == 0) or (table_rows > 100000) - ) - - ordering_hash_part = guid.generate_guid("bigframes_ordering_") - ordering_hash_part2 = guid.generate_guid("bigframes_ordering_") - ordering_rand_part = guid.generate_guid("bigframes_ordering_") - - # All inputs into hash must be non-null or resulting hash will be null - str_values = list( - map(lambda col: _convert_to_nonnull_string(table[col]), table.columns) - ) - full_row_str = ( - str_values[0].concat(*str_values[1:]) - if len(str_values) > 1 - else str_values[0] - ) - full_row_hash = full_row_str.hash().name(ordering_hash_part) - # By modifying value slightly, we get another hash uncorrelated with the first - full_row_hash_p2 = (full_row_str + "_").hash().name(ordering_hash_part2) - # Used to disambiguate between identical rows (which will have identical hash) - random_value = ibis.random().name(ordering_rand_part) - - order_values = ( - [full_row_hash, full_row_hash_p2, random_value] - if use_double_hash - else [full_row_hash, random_value] - ) - - original_column_ids = table.columns - table_with_ordering = table.select( - itertools.chain(original_column_ids, order_values) - ) - - ordering = order.ExpressionOrdering( - ordering_value_columns=tuple( - order.ascending_over(col.get_name()) for col in order_values - ), - total_ordering_columns=frozenset(col.get_name() for col in order_values), - ) - columns = [table_with_ordering[col] for col in original_column_ids] - hidden_columns = [table_with_ordering[col.get_name()] for col in order_values] - return core.ArrayValue.from_ibis( - self, - table_with_ordering, - columns, - hidden_ordering_columns=hidden_columns, - ordering=ordering, - ) - def _ibis_to_temp_table( self, table: ibis_types.Table, @@ -2056,28 +1928,6 @@ def _can_cluster_bq(field: bigquery.SchemaField): ) -def 
_convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue: - col_type = column.type() - if ( - col_type.is_numeric() - or col_type.is_boolean() - or col_type.is_binary() - or col_type.is_temporal() - ): - result = column.cast(ibis_dtypes.String(nullable=True)) - elif col_type.is_geospatial(): - result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() - elif col_type.is_string(): - result = column - else: - # TO_JSON_STRING works with all data types, but isn't the most efficient - # Needed for JSON, STRUCT and ARRAY datatypes - result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore - # Escape backslashes and use backslash as delineator - escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore - return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped) - - def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: """ For backwards-compatibility, convert any previously client-side only diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery/__init__.py similarity index 74% rename from bigframes/session/_io/bigquery.py rename to bigframes/session/_io/bigquery/__init__.py index 94576cfa12..2cd2d8ff9a 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Private module: Helpers for I/O operations.""" +"""Private module: Helpers for BigQuery I/O operations.""" from __future__ import annotations @@ -23,7 +23,6 @@ import types from typing import Dict, Iterable, Optional, Sequence, Tuple, Union import uuid -import warnings import google.api_core.exceptions import google.cloud.bigquery as bigquery @@ -122,77 +121,6 @@ def table_ref_to_sql(table: bigquery.TableReference) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" -def get_snapshot_datetime_and_table_metadata( - bqclient: bigquery.Client, - table_ref: bigquery.TableReference, - *, - api_name: str, - cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], - use_cache: bool = True, -) -> Tuple[datetime.datetime, bigquery.Table]: - cached_table = cache.get(table_ref) - if use_cache and cached_table is not None: - snapshot_timestamp, _ = cached_table - - # Cache hit could be unexpected. See internal issue 329545805. - # Raise a warning with more information about how to avoid the - # problems with the cache. - warnings.warn( - f"Reading cached table from {snapshot_timestamp} to avoid " - "incompatibilies with previous reads of this table. To read " - "the latest version, set `use_cache=False` or close the " - "current session with Session.close() or " - "bigframes.pandas.close_session().", - # There are many layers before we get to (possibly) the user's code: - # pandas.read_gbq_table - # -> with_default_session - # -> Session.read_gbq_table - # -> _read_gbq_table - # -> _get_snapshot_sql_and_primary_key - # -> get_snapshot_datetime_and_table_metadata - stacklevel=7, - ) - return cached_table - - # TODO(swast): It's possible that the table metadata is changed between now - # and when we run the CURRENT_TIMESTAMP() query to see when we can time - # travel to. Find a way to fetch the table metadata and BQ's current time - # atomically. 
- table = bqclient.get_table(table_ref) - - # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" - # whereever we execute a query. - job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name - snapshot_timestamp = list( - bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] - cached_table = (snapshot_timestamp, table) - cache[table_ref] = cached_table - return cached_table - - -def create_snapshot_sql( - table_ref: bigquery.TableReference, current_timestamp: datetime.datetime -) -> str: - """Query a table via 'time travel' for consistent reads.""" - # If we have an anonymous query results table, it can't be modified and - # there isn't any BigQuery time travel. - if table_ref.dataset_id.startswith("_"): - return f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}`" - - return textwrap.dedent( - f""" - SELECT * - FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}` - FOR SYSTEM_TIME AS OF TIMESTAMP({repr(current_timestamp.isoformat())}) - """ - ) - - def create_temp_table( bqclient: bigquery.Client, dataset: bigquery.DatasetReference, diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py new file mode 100644 index 0000000000..3235ca92e5 --- /dev/null +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -0,0 +1,386 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Private helpers for loading a BigQuery table as a BigQuery DataFrames DataFrame. +""" + +from __future__ import annotations + +import datetime +import itertools +import textwrap +import typing +from typing import Dict, Iterable, List, Optional, Tuple +import warnings + +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops +import google.api_core.exceptions +import google.cloud.bigquery as bigquery +import ibis +import ibis.backends +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.types as ibis_types + +import bigframes +import bigframes.clients +import bigframes.core as core +import bigframes.core.compile +import bigframes.core.guid as guid +import bigframes.core.ordering as order +import bigframes.dtypes +import bigframes.session._io.bigquery.read_gbq_table +import bigframes.session.clients +import bigframes.version + +# Avoid circular imports. 
+if typing.TYPE_CHECKING: + import bigframes.session + + +def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue: + col_type = column.type() + if ( + col_type.is_numeric() + or col_type.is_boolean() + or col_type.is_binary() + or col_type.is_temporal() + ): + result = column.cast(ibis_dtypes.String(nullable=True)) + elif col_type.is_geospatial(): + result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() + elif col_type.is_string(): + result = column + else: + # TO_JSON_STRING works with all data types, but isn't the most efficient + # Needed for JSON, STRUCT and ARRAY datatypes + result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore + # Escape backslashes and use backslash as delineator + escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore + return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped) + + +def get_table_metadata( + bqclient: bigquery.Client, + table_ref: google.cloud.bigquery.table.TableReference, + *, + api_name: str, + cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], + use_cache: bool = True, +) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]: + """Get the table metadata, either from cache or via REST API.""" + + cached_table = cache.get(table_ref) + if use_cache and cached_table is not None: + snapshot_timestamp, _ = cached_table + + # Cache hit could be unexpected. See internal issue 329545805. + # Raise a warning with more information about how to avoid the + # problems with the cache. + warnings.warn( + f"Reading cached table from {snapshot_timestamp} to avoid " + "incompatibilies with previous reads of this table. To read " + "the latest version, set `use_cache=False` or close the " + "current session with Session.close() or " + "bigframes.pandas.close_session().", + # There are many layers before we get to (possibly) the user's code: + # pandas.read_gbq_table + # -> with_default_session + # -> Session.read_gbq_table + # -> _read_gbq_table + # -> _get_snapshot_sql_and_primary_key + # -> get_snapshot_datetime_and_table_metadata + stacklevel=7, + ) + return cached_table + + # TODO(swast): It's possible that the table metadata is changed between now + # and when we run the CURRENT_TIMESTAMP() query to see when we can time + # travel to. Find a way to fetch the table metadata and BQ's current time + # atomically. + table = bqclient.get_table(table_ref) + + # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" + # whereever we execute a query. + job_config = bigquery.QueryJobConfig() + job_config.labels["bigframes-api"] = api_name + snapshot_timestamp = list( + bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + cached_table = (snapshot_timestamp, table) + cache[table_ref] = cached_table + return cached_table + + +def _create_time_travel_sql( + table_ref: bigquery.TableReference, time_travel_timestamp: datetime.datetime +) -> str: + """Query a table via 'time travel' for consistent reads.""" + # If we have an anonymous query results table, it can't be modified and + # there isn't any BigQuery time travel. 
+    if table_ref.dataset_id.startswith("_"):
+        return f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}`"
+
+    return textwrap.dedent(
+        f"""
+        SELECT *
+        FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}`
+        FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})
+        """
+    )
+
+
+def get_ibis_time_travel_table(
+    ibis_client: ibis.BaseBackend,
+    table_ref: bigquery.TableReference,
+    time_travel_timestamp: datetime.datetime,
+) -> ibis_types.Table:
+    try:
+        return ibis_client.sql(
+            _create_time_travel_sql(table_ref, time_travel_timestamp)
+        )
+    except google.api_core.exceptions.Forbidden as ex:
+        # Ibis does a dry run to get the types of the columns from the SQL.
+        if "Drive credentials" in ex.message:
+            ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."
+        raise
+
+
+def _check_index_uniqueness(
+    bqclient: bigquery.Client,
+    ibis_client: ibis.BaseBackend,
+    table: ibis_types.Table,
+    index_cols: List[str],
+    api_name: str,
+) -> bool:
+    distinct_table = table.select(*index_cols).distinct()
+    is_unique_sql = f"""WITH full_table AS (
+        {ibis_client.compile(table)}
+    ),
+    distinct_table AS (
+        {ibis_client.compile(distinct_table)}
+    )
+
+    SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
+    (SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
+    """
+    job_config = bigquery.QueryJobConfig()
+    job_config.labels["bigframes-api"] = api_name
+    results = bqclient.query_and_wait(is_unique_sql, job_config=job_config)
+    row = next(iter(results))
+
+    total_count = row["total_count"]
+    distinct_count = row["distinct_count"]
+    return total_count == distinct_count
+
+
+def _get_primary_keys(
+    table: bigquery.table.Table,
+) -> List[str]:
+    """Get primary keys from table if they are set."""
+
+    primary_keys: List[str] = []
+    if (
+        (table_constraints := getattr(table, "table_constraints", None)) is not None
+        and (primary_key := table_constraints.primary_key) is not None
+        # This will be False for either None or empty list.
+        # We want primary_keys == [] if no primary keys are set.
+        and (columns := primary_key.columns)
+    ):
+        primary_keys = columns if columns is not None else []
+
+    return primary_keys
+
+
+def get_index_cols_and_uniqueness(
+    bqclient: bigquery.Client,
+    ibis_client: ibis.BaseBackend,
+    table: bigquery.table.Table,
+    table_expression: ibis_types.Table,
+    index_col: Iterable[str] | str,
+    api_name: str,
+) -> Tuple[List[str], bool]:
+    """
+    If we can get a total ordering from the table, such as via primary key
+    column(s), then return those too so that ordering generation can be
+    avoided.
+    """
+
+    # Transform index_col -> index_cols so we have a variable that is
+    # always a list of column names (possibly empty).
+    if isinstance(index_col, str):
+        index_cols: List[str] = [index_col]
+    else:
+        index_cols = list(index_col)
+
+    # If there isn't an index selected, use the primary keys of the table as
+    # the index. If there are no primary keys, we'll return an empty list.
+    if len(index_cols) == 0:
+        index_cols = _get_primary_keys(table)
+
+        # TODO(b/335727141): If table has clustering/partitioning, fail if
+        # index_cols is empty.
+
+        # If there are primary keys defined, the query engine assumes these
+        # columns are unique, even if the constraint is not enforced. We make
+        # the same assumption and use these columns as the total ordering keys.
+ is_index_unique = len(index_cols) != 0 + else: + is_index_unique = _check_index_uniqueness( + bqclient=bqclient, + ibis_client=ibis_client, + # TODO(b/337925142): Avoid a "SELECT *" subquery here by using + # _create_time_travel_sql with just index_cols. + table=table_expression, + index_cols=index_cols, + api_name=api_name, + ) + + return index_cols, is_index_unique + + +def get_time_travel_datetime_and_table_metadata( + bqclient: bigquery.Client, + table_ref: bigquery.TableReference, + *, + api_name: str, + cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], + use_cache: bool = True, +) -> Tuple[datetime.datetime, bigquery.Table]: + cached_table = cache.get(table_ref) + if use_cache and cached_table is not None: + snapshot_timestamp, _ = cached_table + + # Cache hit could be unexpected. See internal issue 329545805. + # Raise a warning with more information about how to avoid the + # problems with the cache. + warnings.warn( + f"Reading cached table from {snapshot_timestamp} to avoid " + "incompatibilies with previous reads of this table. To read " + "the latest version, set `use_cache=False` or close the " + "current session with Session.close() or " + "bigframes.pandas.close_session().", + # There are many layers before we get to (possibly) the user's code: + # pandas.read_gbq_table + # -> with_default_session + # -> Session.read_gbq_table + # -> _read_gbq_table + # -> _get_snapshot_sql_and_primary_key + # -> get_snapshot_datetime_and_table_metadata + stacklevel=7, + ) + return cached_table + + # TODO(swast): It's possible that the table metadata is changed between now + # and when we run the CURRENT_TIMESTAMP() query to see when we can time + # travel to. Find a way to fetch the table metadata and BQ's current time + # atomically. + table = bqclient.get_table(table_ref) + + # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" + # whereever we execute a query. + job_config = bigquery.QueryJobConfig() + job_config.labels["bigframes-api"] = api_name + snapshot_timestamp = list( + bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + cached_table = (snapshot_timestamp, table) + cache[table_ref] = cached_table + return cached_table + + +def to_array_value_with_total_ordering( + session: bigframes.session.Session, + table_expression: ibis_types.Table, + total_ordering_cols: List[str], +) -> core.ArrayValue: + """Create an ArrayValue, assuming we already have a total ordering.""" + ordering = order.ExpressionOrdering( + ordering_value_columns=tuple( + order.ascending_over(column_id) for column_id in total_ordering_cols + ), + total_ordering_columns=frozenset(total_ordering_cols), + ) + column_values = [table_expression[col] for col in table_expression.columns] + return core.ArrayValue.from_ibis( + session, + table_expression, + columns=column_values, + hidden_ordering_columns=[], + ordering=ordering, + ) + + +def to_array_value_with_default_ordering( + session: bigframes.session.Session, + table: ibis_types.Table, + table_rows: Optional[int], +) -> core.ArrayValue: + """Create an ArrayValue with a deterministic default ordering.""" + # Since this might also be used as the index, don't use the default + # "ordering ID" name. 
+ + # For small tables, 64 bits is enough to avoid collisions, 128 bits will never ever collide no matter what + # Assume table is large if table row count is unknown + use_double_hash = (table_rows is None) or (table_rows == 0) or (table_rows > 100000) + + ordering_hash_part = guid.generate_guid("bigframes_ordering_") + ordering_hash_part2 = guid.generate_guid("bigframes_ordering_") + ordering_rand_part = guid.generate_guid("bigframes_ordering_") + + # All inputs into hash must be non-null or resulting hash will be null + str_values = list( + map(lambda col: _convert_to_nonnull_string(table[col]), table.columns) + ) + full_row_str = ( + str_values[0].concat(*str_values[1:]) if len(str_values) > 1 else str_values[0] + ) + full_row_hash = full_row_str.hash().name(ordering_hash_part) + # By modifying value slightly, we get another hash uncorrelated with the first + full_row_hash_p2 = (full_row_str + "_").hash().name(ordering_hash_part2) + # Used to disambiguate between identical rows (which will have identical hash) + random_value = ibis.random().name(ordering_rand_part) + + order_values = ( + [full_row_hash, full_row_hash_p2, random_value] + if use_double_hash + else [full_row_hash, random_value] + ) + + original_column_ids = table.columns + table_with_ordering = table.select( + itertools.chain(original_column_ids, order_values) + ) + + ordering = order.ExpressionOrdering( + ordering_value_columns=tuple( + order.ascending_over(col.get_name()) for col in order_values + ), + total_ordering_columns=frozenset(col.get_name() for col in order_values), + ) + columns = [table_with_ordering[col] for col in original_column_ids] + hidden_columns = [table_with_ordering[col.get_name()] for col in order_values] + return core.ArrayValue.from_ibis( + session, + table_with_ordering, + columns, + hidden_ordering_columns=hidden_columns, + ordering=ordering, + ) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 406de2b88e..eed1acb5a3 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -137,22 +137,6 @@ def test_create_job_configs_labels_length_limit_met(): assert "source" in labels.keys() -def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): - table_ref = bigquery.TableReference.from_string( - "my-test-project._e8166e0cdb.anonbb92cd" - ) - - sql = bigframes.session._io.bigquery.create_snapshot_sql( - table_ref, datetime.datetime.now(datetime.timezone.utc) - ) - - # Anonymous query results tables don't support time travel. - assert "SYSTEM_TIME" not in sql - - # Need fully-qualified table name. - assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql - - def test_create_temp_table_default_expiration(): """Make sure the created table has an expiration.""" bqclient = mock.create_autospec(bigquery.Client) diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py new file mode 100644 index 0000000000..1d09769aec --- /dev/null +++ b/tests/unit/session/test_read_gbq_table.py @@ -0,0 +1,37 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for read_gbq_table helper functions.""" + +import datetime + +import google.cloud.bigquery as bigquery + +import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table + + +def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): + table_ref = bigquery.TableReference.from_string( + "my-test-project._e8166e0cdb.anonbb92cd" + ) + + sql = bf_read_gbq_table._create_time_travel_sql( + table_ref, datetime.datetime.now(datetime.timezone.utc) + ) + + # Anonymous query results tables don't support time travel. + assert "SYSTEM_TIME" not in sql + + # Need fully-qualified table name. + assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql From 9963f85b84c3b3c681447ab79e22ac93ac48349c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:50:22 -0700 Subject: [PATCH 02/17] feat: add the `bigframes.bigquery` sub-package with a `bigframes.bigquery.array_length` function (#630) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: creats bigquery namespace and adds bigquery.array_length function * add docs * minor fix * fixing docs * add more doc tests * sentence-case * TODO for null arrays --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/bigquery/__init__.py | 60 +++++++++++++++++++++ docs/reference/bigframes.bigquery/index.rst | 9 ++++ docs/reference/bigframes/index.rst | 1 + docs/reference/index.rst | 1 + docs/templates/toc.yml | 4 ++ tests/system/small/bigquery/__init__.py | 13 +++++ tests/system/small/bigquery/test_array.py | 32 +++++++++++ 7 files changed, 120 insertions(+) create mode 100644 bigframes/bigquery/__init__.py create mode 100644 docs/reference/bigframes.bigquery/index.rst create mode 100644 tests/system/small/bigquery/__init__.py create mode 100644 tests/system/small/bigquery/test_array.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py new file mode 100644 index 0000000000..197e0a83b5 --- /dev/null +++ b/bigframes/bigquery/__init__.py @@ -0,0 +1,60 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""This module integrates BigQuery built-in functions for use with DataFrame objects, +such as array functions: +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ + + +from __future__ import annotations + +import typing + +import bigframes.operations as ops + +if typing.TYPE_CHECKING: + import bigframes.series as series + + +def array_length(series: series.Series) -> series.Series: + """Compute the length of each array element in the Series. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. + + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + + """ + return series._apply_unary_op(ops.len_op) diff --git a/docs/reference/bigframes.bigquery/index.rst b/docs/reference/bigframes.bigquery/index.rst new file mode 100644 index 0000000000..03e9bb48a4 --- /dev/null +++ b/docs/reference/bigframes.bigquery/index.rst @@ -0,0 +1,9 @@ + +=========================== +BigQuery Built-in Functions +=========================== + +.. automodule:: bigframes.bigquery + :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes/index.rst b/docs/reference/bigframes/index.rst index 76d64444fa..d26db18c96 100644 --- a/docs/reference/bigframes/index.rst +++ b/docs/reference/bigframes/index.rst @@ -1,4 +1,5 @@ +============ Core objects ============ diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c790831db1..387e9b5ced 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -10,3 +10,4 @@ packages. bigframes/index bigframes.pandas/index bigframes.ml/index + bigframes.bigquery/index diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 4573296ec3..80ccc01fac 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -189,5 +189,9 @@ uid: bigframes.ml.remote.VertexAIModel name: remote name: bigframes.ml + - items: + - name: BigQuery built-in functions + uid: bigframes.bigquery + name: bigframes.bigquery name: BigQuery DataFrames status: beta diff --git a/tests/system/small/bigquery/__init__.py b/tests/system/small/bigquery/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/system/small/bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py new file mode 100644 index 0000000000..a91669cd88 --- /dev/null +++ b/tests/system/small/bigquery/test_array.py @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_array_length(): + series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]]) + # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns. + # Once we actually store NULL values, this will be NULL where the input is NULL. + expected = pd.Series([3, 2, 0, 0, 1]) + pd.testing.assert_series_equal( + bbq.array_length(series).to_pandas(), + expected, + check_dtype=False, + check_index_type=False, + ) From c67e501a4958ac097216cc1c0a9d5c1530c87ae5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 1 May 2024 12:20:16 -0700 Subject: [PATCH 03/17] docs: fix the Palm2TextGenerator output token size (#649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #333480290 🦕 --- bigframes/ml/llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index b455e35b67..9fa9a9acd0 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -233,7 +233,7 @@ def predict( max_output_tokens (int, default 128): Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. - Default 128. For the 'text-bison' model, possible values are in the range [1, 1024]. For the 'text-bison-32k' model, possible values are in the range [1, 8196]. + Default 128. For the 'text-bison' model, possible values are in the range [1, 1024]. For the 'text-bison-32k' model, possible values are in the range [1, 8192]. Please ensure that the specified value for max_output_tokens is within the appropriate range for the model being used. top_k (int, default 40): @@ -269,10 +269,10 @@ def predict( if ( self.model_name == _TEXT_GENERATOR_BISON_32K_ENDPOINT - and max_output_tokens not in range(1, 8197) + and max_output_tokens not in range(1, 8193) ): raise ValueError( - f"max_output_token must be [1, 8196] for TextBison 32k model, but is {max_output_tokens}." + f"max_output_token must be [1, 8192] for TextBison 32k model, but is {max_output_tokens}." ) if top_k not in range(1, 41): From 3867390229e47cae631f43d91e866632bd2b31aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 1 May 2024 16:26:32 -0500 Subject: [PATCH 04/17] test: explicitly use US location for session in tests (#650) This avoids some warnings we see and ignore in our tests. It might also address some flakiness in `tests/system/small/ml/test_llm.py::test_create_text_generator_model` and `tests/system/small/ml/test_llm.py::test_create_text_generator_32k_model`, but the root cause of that flakiness is still TBD. 
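
For reference, a minimal sketch of the same pattern in user code, using only
the public `BigQueryOptions` and `Session` APIs that `bigframes` already
exports (the fixture change below does the equivalent for the shared test
session):

    import bigframes

    # Pin the session to the US multi-region explicitly rather than relying
    # on an inferred default location.
    context = bigframes.BigQueryOptions(location="US")
    session = bigframes.Session(context=context)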
--- tests/system/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 70ff6eee39..a9fb8cb3f5 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -128,7 +128,10 @@ def resourcemanager_client( @pytest.fixture(scope="session") def session() -> bigframes.Session: - return bigframes.Session() + context = bigframes.BigQueryOptions( + location="US", + ) + return bigframes.Session(context=context) @pytest.fixture(scope="session") From e4f13c3633b90e32d3171976d8b27ed10049882f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 1 May 2024 21:35:21 +0000 Subject: [PATCH 05/17] fix: use explicit session in `PaLM2TextGenerator` (#651) This is to avoid running into conflict with the global session which may have different options (e.g. location, connection etc.) set. --- bigframes/ml/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 9fa9a9acd0..4b07524194 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -144,7 +144,7 @@ def _from_bq( kwargs: dict = {} last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_text_generator = cls() + dummy_text_generator = cls(session=session) for bf_param, _ in dummy_text_generator.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param in last_fitting: From 8e4616b896f4e0d13d8bb0424c89335d3a1fe697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 2 May 2024 15:39:12 -0500 Subject: [PATCH 06/17] fix: don't raise UnknownLocationWarning for US or EU multi-regions (#653) --- bigframes/constants.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bigframes/constants.py b/bigframes/constants.py index c6d8f3acc2..4778eb9c9e 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -31,6 +31,7 @@ # https://cloud.google.com/bigquery/docs/locations ALL_BIGQUERY_LOCATIONS = frozenset( { + # regions "us-east5", "us-south1", "us-central1", @@ -74,6 +75,9 @@ "me-central1", "me-west1", "africa-south1", + # multi-regions + "US", + "EU", } ) From 81d1262a40c133017c6debe89506d66aab7bb0c5 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Thu, 2 May 2024 16:47:30 -0500 Subject: [PATCH 07/17] feat: add `ARIMAPlus.coef_` property exposing `ML.ARIMA_COEFFICIENTS` functionality (#585) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create_single_timeseries_forecasting_model_test.py code sample * fix: forecast method to forecast time series * pair programming PR draft creation * feature: insoect coefficients * update tests for new feature * add arima_model.coef_ to fetch coefficients * updated tests for coefficients feature * feature update for arima_coefficients * updates to output cols * docstring updates --------- Co-authored-by: Salem Boyland Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/core.py | 5 +++ bigframes/ml/forecasting.py | 21 ++++++++++++ bigframes/ml/sql.py | 4 +++ tests/system/large/ml/test_forecasting.py | 42 ++++++++++++++++------- tests/unit/ml/test_sql.py | 10 ++++++ 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 7b4638157e..168bc584f7 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -205,6 +205,11 @@ def arima_evaluate(self, show_all_candidate_models: bool = False): return self._session.read_gbq(sql) + def 
arima_coefficients(self) -> bpd.DataFrame: + sql = self._model_manipulation_sql_generator.ml_arima_coefficients() + + return self._session.read_gbq(sql) + def centroids(self) -> bpd.DataFrame: assert self._model.model_type == "KMEANS" diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 5bd01c8826..783e7741b8 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -269,6 +269,27 @@ def predict( options={"horizon": horizon, "confidence_level": confidence_level} ) + @property + def coef_( + self, + ) -> bpd.DataFrame: + """Inspect the coefficients of the model. + + ..note:: + + Output matches that of the ML.ARIMA_COEFFICIENTS function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-arima-coefficients + for the outputs relevant to this model type. + + Returns: + bigframes.dataframe.DataFrame: + A DataFrame with the coefficients for the model. + """ + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before inspect coefficients") + return self._bqml_model.arima_coefficients() + def detect_anomalies( self, X: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 3679be16c6..ea693e3437 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -318,6 +318,10 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, ({source_sql}))""" + def ml_arima_coefficients(self) -> str: + """Encode ML.ARIMA_COEFFICIENTS for BQML""" + return f"""SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL `{self._model_name}`)""" + # ML evaluation TVFs def ml_llm_evaluate( self, source_df: bpd.DataFrame, task_type: Optional[str] = None diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index b333839e2e..ef74398c2e 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pandas as pd +import pytest from bigframes.ml import forecasting @@ -31,15 +32,22 @@ ] -def test_arima_plus_model_fit_score( - time_series_df_default_index, dataset_id, new_time_series_df -): +@pytest.fixture(scope="module") +def arima_model(time_series_df_default_index): model = forecasting.ARIMAPlus() X_train = time_series_df_default_index[["parsed_date"]] y_train = time_series_df_default_index[["total_visits"]] model.fit(X_train, y_train) + return model + + +def test_arima_plus_model_fit_score( + dataset_id, + new_time_series_df, + arima_model, +): - result = model.score( + result = arima_model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] ).to_pandas() expected = pd.DataFrame( @@ -56,29 +64,39 @@ def test_arima_plus_model_fit_score( pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load to ensure configuration was kept - reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) + reloaded_model = arima_model.to_gbq( + f"{dataset_id}.temp_arima_plus_model", replace=True + ) assert ( f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name ) -def test_arima_plus_model_fit_summary(time_series_df_default_index, dataset_id): - model = forecasting.ARIMAPlus() - X_train = time_series_df_default_index[["parsed_date"]] - y_train = time_series_df_default_index[["total_visits"]] - model.fit(X_train, y_train) +def test_arima_plus_model_fit_summary(dataset_id, arima_model): - result = model.summary() + result = arima_model.summary() assert result.shape == (1, 12) assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) # save, load to ensure configuration was kept - reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) + reloaded_model = arima_model.to_gbq( + f"{dataset_id}.temp_arima_plus_model", replace=True + ) assert ( f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name ) +def test_arima_coefficients(arima_model): + got = arima_model.coef_ + expected_columns = { + "ar_coefficients", + "ma_coefficients", + "intercept_or_drift", + } + assert set(got.columns) == expected_columns + + def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): model = forecasting.ARIMAPlus( horizon=100, diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 1a5e8fe962..4dd90b2c4a 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -47,6 +47,16 @@ def mock_df(): return mock_df +def test_ml_arima_coefficients( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_arima_coefficients() + assert ( + sql + == """SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + ) + + def test_options_correct(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.options( model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6 From 73064dd2aa1ece5de8f5849a0fd337d0ba677404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 2 May 2024 17:51:14 -0500 Subject: [PATCH 08/17] feat: raise `NoDefaultIndexError` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set (#631) This should help customers better discover the best practices for working with large tables. 
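
As a usage sketch (the table name below is hypothetical), such tables can
still be read by opting in to an index explicitly, for example with the new
`bigframes.enums.DefaultIndexKind` enum introduced in this change:

    import bigframes.enums
    import bigframes.pandas as bpd

    # Reading a clustered/partitioned table with no index_col or filters now
    # raises NoDefaultIndexError; asking for a sequential index avoids it.
    df = bpd.read_gbq(
        "my-project.my_dataset.my_clustered_table",
        index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64,
    )

Passing explicit `index_col` column names or `filters` also avoids the error.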
feat: support `index_col=False` in `read_csv` and `engine="bigquery"` --- bigframes/__init__.py | 4 + bigframes/core/blocks.py | 10 + bigframes/enums.py | 29 +++ bigframes/exceptions.py | 8 + bigframes/pandas/__init__.py | 15 +- bigframes/session/__init__.py | 74 ++++-- .../session/_io/bigquery/read_gbq_table.py | 64 ++++- docs/reference/bigframes/enums.rst | 8 + docs/reference/bigframes/exceptions.rst | 8 + docs/reference/bigframes/index.rst | 2 + docs/templates/toc.yml | 4 + tests/system/small/test_dataframe_io.py | 4 +- tests/system/small/test_session.py | 62 ++--- tests/unit/session/test_session.py | 222 +++++++++++++++++- .../bigframes_vendored/pandas/io/gbq.py | 38 ++- .../pandas/io/parsers/readers.py | 9 +- 16 files changed, 478 insertions(+), 83 deletions(-) create mode 100644 bigframes/enums.py create mode 100644 docs/reference/bigframes/enums.rst create mode 100644 docs/reference/bigframes/exceptions.rst diff --git a/bigframes/__init__.py b/bigframes/__init__.py index bd1476957b..240608ebc2 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -17,6 +17,8 @@ from bigframes._config import option_context, options from bigframes._config.bigquery_options import BigQueryOptions from bigframes.core.global_session import close_session, get_global_session +import bigframes.enums as enums +import bigframes.exceptions as exceptions from bigframes.session import connect, Session from bigframes.version import __version__ @@ -25,6 +27,8 @@ "BigQueryOptions", "get_global_session", "close_session", + "enums", + "exceptions", "connect", "Session", "__version__", diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4ff8a1836b..402581eb6f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -116,10 +116,20 @@ def __init__( raise ValueError( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) + + # If no index columns are set, create one. + # + # Note: get_index_cols_and_uniqueness in + # bigframes/session/_io/bigquery/read_gbq_table.py depends on this + # being as sequential integer index column. If this default behavior + # ever changes, please also update get_index_cols_and_uniqueness so + # that users who explicitly request a sequential integer index can + # still get one. if len(index_columns) == 0: new_index_col_id = guid.generate_guid() expr = expr.promote_offsets(new_index_col_id) index_columns = [new_index_col_id] + self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( diff --git a/bigframes/enums.py b/bigframes/enums.py new file mode 100644 index 0000000000..4bec75f5df --- /dev/null +++ b/bigframes/enums.py @@ -0,0 +1,29 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Public enums used across BigQuery DataFrames.""" + +# NOTE: This module should not depend on any others in the package. 
+ + +import enum + + +class DefaultIndexKind(enum.Enum): + """Sentinel values used to override default indexing behavior.""" + + #: Use consecutive integers as the index. This is ``0``, ``1``, ``2``, ..., + #: ``n - 3``, ``n - 2``, ``n - 1``, where ``n`` is the number of items in + #: the index. + SEQUENTIAL_INT64 = enum.auto() diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 62122e79d2..d179914983 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -12,6 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Public exceptions and warnings used across BigQuery DataFrames.""" + +# NOTE: This module should not depend on any others in the package. + class UnknownLocationWarning(Warning): """The location is set to an unknown value.""" + + +class NoDefaultIndexError(ValueError): + """Unable to create a default index.""" diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 48a4b0f68d..ce69f49c89 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -63,6 +63,7 @@ import bigframes.core.reshape import bigframes.core.tools import bigframes.dataframe +import bigframes.enums import bigframes.operations as ops import bigframes.series import bigframes.session @@ -423,7 +424,13 @@ def read_csv( Union[MutableSequence[Any], numpy.ndarray[Any, Any], Tuple[Any, ...], range] ] = None, index_col: Optional[ - Union[int, str, Sequence[Union[str, int]], Literal[False]] + Union[ + int, + str, + Sequence[Union[str, int]], + bigframes.enums.DefaultIndexKind, + Literal[False], + ] ] = None, usecols: Optional[ Union[ @@ -491,7 +498,7 @@ def read_json( def read_gbq( query_or_table: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -529,7 +536,7 @@ def read_gbq_model(model_name: str): def read_gbq_query( query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -555,7 +562,7 @@ def read_gbq_query( def read_gbq_table( query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), max_results: Optional[int] = None, filters: vendored_pandas_gbq.FiltersType = (), diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0f5aa19592..6b84d838cf 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -294,7 +294,7 @@ def read_gbq( self, query_or_table: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -313,6 +313,9 @@ def read_gbq( filters = list(filters) if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): + # TODO(b/338111344): This appears to be missing index_cols, which + # are necessary to be selected. + # TODO(b/338039517): Also, need to account for primary keys. 
query_or_table = self._to_query(query_or_table, columns, filters) if _is_query(query_or_table): @@ -326,9 +329,6 @@ def read_gbq( use_cache=use_cache, ) else: - # TODO(swast): Query the snapshot table but mark it as a - # deterministic query so we can avoid serializing if we have a - # unique index. if configuration is not None: raise ValueError( "The 'configuration' argument is not allowed when " @@ -359,6 +359,8 @@ def _to_query( else f"`{query_or_table}`" ) + # TODO(b/338111344): Generate an index based on DefaultIndexKind if we + # don't have index columns specified. select_clause = "SELECT " + ( ", ".join(f"`{column}`" for column in columns) if columns else "*" ) @@ -488,7 +490,7 @@ def read_gbq_query( self, query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -566,7 +568,7 @@ def _read_gbq_query( self, query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -598,7 +600,9 @@ def _read_gbq_query( True if use_cache is None else use_cache ) - if isinstance(index_col, str): + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + index_cols = [] + elif isinstance(index_col, str): index_cols = [index_col] else: index_cols = list(index_col) @@ -628,7 +632,7 @@ def _read_gbq_query( return self.read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", - index_col=index_cols, + index_col=index_col, columns=columns, max_results=max_results, use_cache=configuration["query"]["useQueryCache"], @@ -638,7 +642,7 @@ def read_gbq_table( self, query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), max_results: Optional[int] = None, filters: third_party_pandas_gbq.FiltersType = (), @@ -693,7 +697,7 @@ def _read_gbq_table( self, query: str, *, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), max_results: Optional[int] = None, api_name: str, @@ -821,10 +825,12 @@ def _read_bigquery_load_job( table: Union[bigquery.Table, bigquery.TableReference], *, job_config: bigquery.LoadJobConfig, - index_col: Iterable[str] | str = (), + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), ) -> dataframe.DataFrame: - if isinstance(index_col, str): + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + index_cols = [] + elif isinstance(index_col, str): index_cols = [index_col] else: index_cols = list(index_col) @@ -1113,7 +1119,13 @@ def read_csv( Union[MutableSequence[Any], np.ndarray[Any, Any], Tuple[Any, ...], range] ] = None, index_col: Optional[ - Union[int, str, Sequence[Union[str, int]], Literal[False]] + Union[ + int, + str, + Sequence[Union[str, int]], + bigframes.enums.DefaultIndexKind, + Literal[False], + ] ] = None, usecols: Optional[ Union[ @@ -1143,18 +1155,37 @@ def read_csv( f"{constants.FEEDBACK_LINK}" ) - if index_col is not None and ( - not index_col or not isinstance(index_col, str) + # TODO(b/338089659): Looks like we can relax this 1 column + # restriction if we check the contents of an iterable are strings + # not integers. 
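# A sketch of the ``index_col`` values this BigQuery-engine path accepts,
# mirroring the checks that follow (the GCS path is illustrative): a single
# column name string, a falsey value such as ``False`` to request a default
# index, or a ``DefaultIndexKind`` value.
#
#     session.read_csv("gs://bucket/data*.csv", engine="bigquery",
#                      index_col=False)
#     session.read_csv("gs://bucket/data*.csv", engine="bigquery",
#                      index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64)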
+ if ( + # Empty tuples, None, and False are allowed and falsey. + index_col + and not isinstance(index_col, bigframes.enums.DefaultIndexKind) + and not isinstance(index_col, str) ): raise NotImplementedError( - "BigQuery engine only supports a single column name for `index_col`. " - f"{constants.FEEDBACK_LINK}" + "BigQuery engine only supports a single column name for `index_col`, " + f"got: {repr(index_col)}. {constants.FEEDBACK_LINK}" ) - # None value for index_col cannot be passed to read_gbq - if index_col is None: + # None and False cannot be passed to read_gbq. + # TODO(b/338400133): When index_col is None, we should be using the + # first column of the CSV as the index to be compatible with the + # pandas engine. According to the pandas docs, only "False" + # indicates a default sequential index. + if not index_col: index_col = () + index_col = typing.cast( + Union[ + Sequence[str], # Falsey values + bigframes.enums.DefaultIndexKind, + str, + ], + index_col, + ) + # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. columns: Tuple[Any, ...] = tuple() if usecols is not None: @@ -1199,6 +1230,11 @@ def read_csv( columns=columns, ) else: + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + raise NotImplementedError( + f"With index_col={repr(index_col)}, only engine='bigquery' is supported. " + f"{constants.FEEDBACK_LINK}" + ) if any(arg in kwargs for arg in ("chunksize", "iterator")): raise NotImplementedError( "'chunksize' and 'iterator' arguments are not supported. " diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 3235ca92e5..29d5a5567f 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -35,6 +35,7 @@ import bigframes import bigframes.clients +import bigframes.constants import bigframes.core as core import bigframes.core.compile import bigframes.core.guid as guid @@ -206,12 +207,35 @@ def _get_primary_keys( return primary_keys +def _is_table_clustered_or_partitioned( + table: bigquery.table.Table, +) -> bool: + """Returns True if the table is clustered or partitioned.""" + + # Could be None or an empty tuple if it's not clustered, both of which are + # falsey. + if table.clustering_fields: + return True + + if ( + time_partitioning := table.time_partitioning + ) is not None and time_partitioning.type_ is not None: + return True + + if ( + range_partitioning := table.range_partitioning + ) is not None and range_partitioning.field is not None: + return True + + return False + + def get_index_cols_and_uniqueness( bqclient: bigquery.Client, ibis_client: ibis.BaseBackend, table: bigquery.table.Table, table_expression: ibis_types.Table, - index_col: Iterable[str] | str, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind, api_name: str, ) -> Tuple[List[str], bool]: """ @@ -222,7 +246,23 @@ def get_index_cols_and_uniqueness( # Transform index_col -> index_cols so we have a variable that is # always a list of column names (possibly empty). - if isinstance(index_col, str): + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: + # User has explicity asked for a default, sequential index. + # Use that, even if there are primary keys on the table. + # + # Note: This relies on the default behavior of the Block + # constructor to create a default sequential index. 
If that ever + # changes, this logic will need to be revisited. + return [], False + else: + # Note: It's actually quite difficult to mock this out to unit + # test, as it's not possible to subclass enums in Python. See: + # https://stackoverflow.com/a/33680021/101923 + raise NotImplementedError( + f"Got unexpected index_col {repr(index_col)}. {bigframes.constants.FEEDBACK_LINK}" + ) + elif isinstance(index_col, str): index_cols: List[str] = [index_col] else: index_cols = list(index_col) @@ -230,14 +270,26 @@ def get_index_cols_and_uniqueness( # If the isn't an index selected, use the primary keys of the table as the # index. If there are no primary keys, we'll return an empty list. if len(index_cols) == 0: - index_cols = _get_primary_keys(table) - - # TODO(b/335727141): If table has clustering/partitioning, fail if - # index_cols is empty. + primary_keys = _get_primary_keys(table) + + # If table has clustering/partitioning, fail if we haven't been able to + # find index_cols to use. This is to avoid unexpected performance and + # resource utilization because of the default sequential index. See + # internal issue 335727141. + if _is_table_clustered_or_partitioned(table) and not primary_keys: + raise bigframes.exceptions.NoDefaultIndexError( + f"Table '{str(table.reference)}' is clustered and/or " + "partitioned, but BigQuery DataFrames was not able to find a " + "suitable index. To avoid this error, set at least one of: " + # TODO(b/338037499): Allow max_results to override this too, + # once we make it more efficient. + "`index_col` or `filters`." + ) # If there are primary keys defined, the query engine assumes these # columns are unique, even if the constraint is not enforced. We make # the same assumption and use these columns as the total ordering keys. + index_cols = primary_keys is_index_unique = len(index_cols) != 0 else: is_index_unique = _check_index_uniqueness( diff --git a/docs/reference/bigframes/enums.rst b/docs/reference/bigframes/enums.rst new file mode 100644 index 0000000000..b0a198e184 --- /dev/null +++ b/docs/reference/bigframes/enums.rst @@ -0,0 +1,8 @@ + +===== +Enums +===== + +.. automodule:: bigframes.enums + :members: + :undoc-members: diff --git a/docs/reference/bigframes/exceptions.rst b/docs/reference/bigframes/exceptions.rst new file mode 100644 index 0000000000..c471aecdf7 --- /dev/null +++ b/docs/reference/bigframes/exceptions.rst @@ -0,0 +1,8 @@ + +======================= +Exceptions and Warnings +======================= + +.. automodule:: bigframes.exceptions + :members: + :undoc-members: diff --git a/docs/reference/bigframes/index.rst b/docs/reference/bigframes/index.rst index d26db18c96..f56883dc8e 100644 --- a/docs/reference/bigframes/index.rst +++ b/docs/reference/bigframes/index.rst @@ -6,6 +6,8 @@ Core objects .. 
toctree:: :maxdepth: 2 + enums + exceptions options diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 80ccc01fac..67e628eb7d 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -32,6 +32,10 @@ - name: Session uid: bigframes.session.Session name: Session + - name: Enumerations + uid: bigframes.enums + - name: Exceptions and warnings + uid: bigframes.exceptions name: Core Objects - items: - name: DataFrame diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index f26902f084..f36dd64cbe 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -117,8 +117,8 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): @pytest.mark.parametrize( - ("index"), - [True, False], + ("index",), + [(True,), (False,)], ) def test_to_csv_index( scalars_dfs: Tuple[bigframes.dataframe.DataFrame, pd.DataFrame], diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 1e76a8bd8b..2779874d6c 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -524,7 +524,11 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" scalars_df.to_csv(path, index=False) - df = session.read_csv(path, engine="bigquery") + df = session.read_csv( + path, + engine="bigquery", + index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, + ) # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -629,44 +633,24 @@ def test_read_csv_localbuffer_bq_engine(session, scalars_dfs): pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) -@pytest.mark.parametrize( - ("kwargs", "match"), - [ - pytest.param( - {"engine": "bigquery", "names": []}, - "BigQuery engine does not support these arguments", - id="with_names", - ), - pytest.param( - {"engine": "bigquery", "dtype": {}}, - "BigQuery engine does not support these arguments", - id="with_dtype", - ), - pytest.param( - {"engine": "bigquery", "index_col": False}, - "BigQuery engine only supports a single column name for `index_col`.", - id="with_index_col_false", - ), - pytest.param( - {"engine": "bigquery", "index_col": 5}, - "BigQuery engine only supports a single column name for `index_col`.", - id="with_index_col_not_str", - ), - pytest.param( - {"engine": "bigquery", "usecols": [1, 2]}, - "BigQuery engine only supports an iterable of strings for `usecols`.", - id="with_usecols_invalid", - ), - pytest.param( - {"engine": "bigquery", "encoding": "ASCII"}, - "BigQuery engine only supports the following encodings", - id="with_encoding_invalid", - ), - ], -) -def test_read_csv_bq_engine_throws_not_implemented_error(session, kwargs, match): - with pytest.raises(NotImplementedError, match=match): - session.read_csv("", **kwargs) +def test_read_csv_bq_engine_supports_index_col_false( + session, scalars_df_index, gcs_folder +): + path = gcs_folder + "test_read_csv_bq_engine_supports_index_col_false*.csv" + read_path = utils.get_first_file_from_wildcard(path) + scalars_df_index.to_csv(path) + + df = session.read_csv( + read_path, + # Normally, pandas uses the first column as the index. index_col=False + # turns off that behavior. 
+ index_col=False, + ) + assert df.shape[0] == scalars_df_index.shape[0] + + # We use a default index because of index_col=False, so the previous index + # column is just loaded as a column. + assert len(df.columns) == len(scalars_df_index.columns) + 1 @pytest.mark.parametrize( diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 34f185cafd..70a121435c 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import datetime import os import re @@ -23,10 +24,129 @@ import pytest import bigframes +import bigframes.enums import bigframes.exceptions from .. import resources +TABLE_REFERENCE = { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "my_table", +} +CLUSTERED_OR_PARTITIONED_TABLES = [ + pytest.param( + google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": TABLE_REFERENCE, + "clustering": { + "fields": ["col1", "col2"], + }, + }, + ), + id="clustered", + ), + pytest.param( + google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": TABLE_REFERENCE, + "rangePartitioning": { + "field": "col1", + "range": { + "start": 1, + "end": 100, + "interval": 1, + }, + }, + }, + ), + id="range-partitioned", + ), + pytest.param( + google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": TABLE_REFERENCE, + "timePartitioning": { + "type": "MONTH", + "field": "col1", + }, + }, + ), + id="time-partitioned", + ), + pytest.param( + google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": TABLE_REFERENCE, + "clustering": { + "fields": ["col1", "col2"], + }, + "timePartitioning": { + "type": "MONTH", + "field": "col1", + }, + }, + ), + id="time-partitioned-and-clustered", + ), +] + + +@pytest.mark.parametrize( + ("kwargs", "match"), + [ + pytest.param( + {"engine": "bigquery", "names": []}, + "BigQuery engine does not support these arguments", + id="with_names", + ), + pytest.param( + {"engine": "bigquery", "dtype": {}}, + "BigQuery engine does not support these arguments", + id="with_dtype", + ), + pytest.param( + {"engine": "bigquery", "index_col": 5}, + "BigQuery engine only supports a single column name for `index_col`.", + id="with_index_col_not_str", + ), + pytest.param( + {"engine": "bigquery", "usecols": [1, 2]}, + "BigQuery engine only supports an iterable of strings for `usecols`.", + id="with_usecols_invalid", + ), + pytest.param( + {"engine": "bigquery", "encoding": "ASCII"}, + "BigQuery engine only supports the following encodings", + id="with_encoding_invalid", + ), + ], +) +def test_read_csv_bq_engine_throws_not_implemented_error(kwargs, match): + session = resources.create_bigquery_session() + + with pytest.raises(NotImplementedError, match=match): + session.read_csv("", **kwargs) + + +@pytest.mark.parametrize( + ("engine",), + ( + ("c",), + ("python",), + ("pyarrow",), + ), +) +def test_read_csv_pandas_engines_index_col_sequential_int64_not_supported(engine): + session = resources.create_bigquery_session() + + with pytest.raises(NotImplementedError, match="index_col"): + session.read_csv( + "path/to/csv.csv", + engine=engine, + index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, + ) + @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) def test_read_gbq_missing_parts(missing_parts_table_id): @@ -65,14 +185,109 @@ def get_table_mock(table_ref): assert "1999-01-02T03:04:05.678901" in df.sql -def 
test_read_gbq_clustered_table_ok_default_index_with_primary_key(): +@pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) +def test_no_default_index_error_raised_by_read_gbq(table): + """Because of the windowing operation to create a default index, row + filters can't push down to the clustering column. + + Raise an exception in this case so that the user is directed to supply a + unique index column or filter if possible. + + See internal issue 335727141. + """ + table = copy.deepcopy(table) + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + session = resources.create_bigquery_session(bqclient=bqclient) + table._properties["location"] = session._location + + with pytest.raises(bigframes.exceptions.NoDefaultIndexError): + session.read_gbq("my-project.my_dataset.my_table") + + +@pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) +def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int64( + table, +): + """Because of the windowing operation to create a default index, row + filters can't push down to the clustering column. + + Allow people to use the default index only if they explicitly request it. + + See internal issue 335727141. + """ + table = copy.deepcopy(table) + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + session = resources.create_bigquery_session(bqclient=bqclient) + table._properties["location"] = session._location + + # No exception raised because we set the option allowing the default indexes. + df = session.read_gbq( + "my-project.my_dataset.my_table", + index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, + ) + + # We expect a window operation because we specificaly requested a sequential index. + generated_sql = df.sql.casefold() + assert "OVER".casefold() in generated_sql + assert "ROW_NUMBER()".casefold() in generated_sql + + +@pytest.mark.parametrize( + ("total_count", "distinct_count"), + ( + (0, 0), + (123, 123), + # Should still have a positive effect, even if the index is not unique. + (123, 111), + ), +) +@pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) +def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( + total_count, + distinct_count, + table, +): + table = copy.deepcopy(table) + table.schema = ( + google.cloud.bigquery.SchemaField("idx_1", "INT64"), + google.cloud.bigquery.SchemaField("idx_2", "INT64"), + google.cloud.bigquery.SchemaField("col_1", "INT64"), + google.cloud.bigquery.SchemaField("col_2", "INT64"), + ) + + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + bqclient.query_and_wait.return_value = ( + {"total_count": total_count, "distinct_count": distinct_count}, + ) + session = resources.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + table._properties["location"] = session._location + + # No exception raised because there are columns to use as the index. + df = session.read_gbq( + "my-project.my_dataset.my_table", index_col=("idx_1", "idx_2") + ) + + # There should be no analytic operators to prevent row filtering pushdown. 
+ assert "OVER" not in df.sql + assert tuple(df.index.names) == ("idx_1", "idx_2") + + +@pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) +def test_no_default_index_error_not_raised_by_read_gbq_primary_key(table): """If a primary key is set on the table, we use that as the index column by default, no error should be raised in this case. See internal issue 335727141. """ - table = google.cloud.bigquery.Table("my-project.my_dataset.my_table") - table.clustering_fields = ["col1", "col2"] + table = copy.deepcopy(table) table.schema = ( google.cloud.bigquery.SchemaField("pk_1", "INT64"), google.cloud.bigquery.SchemaField("pk_2", "INT64"), @@ -95,6 +310,7 @@ def test_read_gbq_clustered_table_ok_default_index_with_primary_key(): ) table._properties["location"] = session._location + # No exception raised because there is a primary key to use as the index. df = session.read_gbq("my-project.my_dataset.my_table") # There should be no analytic operators to prevent row filtering pushdown. diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 93cee71289..c25dd8776f 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -6,6 +6,7 @@ from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union from bigframes import constants +import bigframes.enums FilterOps = Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">", "LIKE"] FilterType = Tuple[str, FilterOps, Any] @@ -17,7 +18,7 @@ def read_gbq( self, query_or_table: str, *, - index_col: Iterable[str] | str = (), + index_col: Union[Iterable[str], str, bigframes.enums.DefaultIndexKind] = (), columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, @@ -28,16 +29,23 @@ def read_gbq( """Loads a DataFrame from BigQuery. BigQuery tables are an unordered, unindexed data source. To add support - pandas-compatibility, the following indexing options are supported: - - * (Default behavior) Add an arbitrary sequential index and ordering - using an an analytic windowed operation that prevents filtering - push down. + pandas-compatibility, the following indexing options are supported via + the ``index_col`` parameter: + + * (Empty iterable, default) A default index. **Behavior may change.** + Explicitly set ``index_col`` if your application makes use of + specific index values. + + If a table has primary key(s), those are used as the index, + otherwise a sequential index is generated. + * (:attr:`bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64`) Add an + arbitrary sequential index and ordering. **Warning** This uses an + analytic windowed operation that prevents filtering push down. Avoid + using on large clustered or partitioned tables. * (Recommended) Set the ``index_col`` argument to one or more columns. Unique values for the row labels are recommended. Duplicate labels are possible, but note that joins on a non-unique index can duplicate - rows and operations like ``cumsum()`` that window across a non-unique - index can have some non-deternimism. + rows via pandas-like outer join behavior. .. note:: By default, even SQL query inputs with an ORDER BY clause create a @@ -107,11 +115,18 @@ def read_gbq( `project.dataset.tablename` or `dataset.tablename`. Can also take wildcard table name, such as `project.dataset.table_prefix*`. In tha case, will read all the matched table as one DataFrame. 
- index_col (Iterable[str] or str): + index_col (Iterable[str], str, bigframes.enums.DefaultIndexKind): Name of result column(s) to use for index in results DataFrame. + If an empty iterable, such as ``()``, a default index is + generated. Do not depend on specific index values in this case. + **New in bigframes version 1.3.0**: If ``index_cols`` is not set, the primary key(s) of the table are used as the index. + + **New in bigframes version 1.4.0**: Support + :class:`bigframes.enums.DefaultIndexKind` to override default index + behavior. columns (Iterable[str]): List of BigQuery column names in the desired order for results DataFrame. @@ -141,6 +156,11 @@ def read_gbq( col_order (Iterable[str]): Alias for columns, retained for backwards compatibility. + Raises: + bigframes.exceptions.NoDefaultIndexError: + Using the default index is discouraged, such as with clustered + or partitioned tables without primary keys. + Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. """ diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index e8ed6182a6..d147abfd22 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -21,6 +21,7 @@ import numpy as np from bigframes import constants +import bigframes.enums class ReaderIOMixin: @@ -34,7 +35,13 @@ def read_csv( Union[MutableSequence[Any], np.ndarray[Any, Any], Tuple[Any, ...], range] ] = None, index_col: Optional[ - Union[int, str, Sequence[Union[str, int]], Literal[False]] + Union[ + int, + str, + Sequence[Union[str, int]], + bigframes.enums.DefaultIndexKind, + Literal[False], + ] ] = None, usecols=None, dtype: Optional[Dict] = None, From c8d4e231fe8263f5b10fae9b879ff82df58da534 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 2 May 2024 18:10:19 -0700 Subject: [PATCH 09/17] feat: Add a unique session_id to Session and allow cleaning up sessions (#553) - temporary tables will have the session id in their names - session.close() will delete temporary tables that were created using the session. - add pandas.get_default_session_id() - add manual_cleanup_by_session_id(session_id: str). 
this is slow but allows users to clean up if they lost the session object --- bigframes/core/global_session.py | 20 +++- bigframes/dataframe.py | 6 +- bigframes/exceptions.py | 4 + bigframes/pandas/__init__.py | 62 +++++++++++ bigframes/session/__init__.py | 84 ++++++++++----- bigframes/session/_io/bigquery/__init__.py | 102 +++++++++++++----- tests/system/conftest.py | 14 ++- tests/system/large/test_session.py | 81 +++++++++++++- tests/system/small/test_encryption.py | 8 +- tests/system/small/test_pandas_options.py | 11 +- tests/unit/session/test_io_bigquery.py | 10 +- .../bigframes_vendored/pandas/core/frame.py | 2 +- .../pandas/plotting/_core.py | 2 + 13 files changed, 330 insertions(+), 76 deletions(-) diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 1f960839a0..31dfc9bd17 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -15,7 +15,11 @@ """Utilities for managing a default, globally available Session object.""" import threading +import traceback from typing import Callable, Optional, TypeVar +import warnings + +import google.auth.exceptions import bigframes._config import bigframes.session @@ -27,7 +31,8 @@ def close_session() -> None: """Start a fresh session the next time a function requires a session. - Closes the current session if it was already started. + Closes the current session if it was already started, deleting any + temporary tables that were created. Returns: None @@ -36,7 +41,18 @@ def close_session() -> None: with _global_session_lock: if _global_session is not None: - _global_session.close() + try: + _global_session.close() + except google.auth.exceptions.RefreshError as e: + session_id = _global_session.session_id + location = _global_session._location + project_id = _global_session._project + warnings.warn( + f"Session cleanup failed for session with id: {session_id}, " + f"location: {location}, project: {project_id}", + category=bigframes.exceptions.CleanupFailedWarning, + ) + traceback.print_tb(e.__traceback__) _global_session = None bigframes._config.options.bigquery._session_started = False diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 092c8ab82f..d694216ebe 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2928,8 +2928,10 @@ def to_gbq( ) if_exists = "replace" - temp_table_ref = bigframes.session._io.bigquery.random_table( - self._session._anonymous_dataset + temp_table_ref = self._session._random_table( + # The client code owns this table reference now, so skip_cleanup=True + # to not clean it up when we close the session. 
+ skip_cleanup=True, ) destination_table = f"{temp_table_ref.project}.{temp_table_ref.dataset_id}.{temp_table_ref.table_id}" diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index d179914983..222df069f6 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -21,5 +21,9 @@ class UnknownLocationWarning(Warning): """The location is set to an unknown value.""" +class CleanupFailedWarning(Warning): + """Bigframes failed to clean up a table resource.""" + + class NoDefaultIndexError(ValueError): """Unable to create a default index.""" diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index ce69f49c89..fc87bec3df 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -706,6 +706,68 @@ def to_datetime( to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ +def get_default_session_id() -> str: + """Gets the session id that is used whenever a custom session + has not been provided. + + It is the session id of the default global session. It is prefixed to + the table id of all temporary tables created in the global session. + + Returns: + str, the default global session id, ex. 'sessiona1b2c' + """ + return get_global_session().session_id + + +def clean_up_by_session_id( + session_id: str, + location: Optional[str] = None, + project: Optional[str] = None, +) -> None: + """Searches through table names in BigQuery and deletes tables + found matching the expected format. + + This could be useful if the session object has been lost. + Calling `session.close()` or `bigframes.pandas.close_session()` + is preferred in most cases. + + Args: + session_id (str): + The session id to clean up. Can be found using + session.session_id or get_default_session_id(). + + location (str, default None): + The location of the session to clean up. If given, used + together with project kwarg to determine the dataset + to search through for tables to clean up. + + project (str, default None): + The project id associated with the session to clean up. + If given, used together with location kwarg to determine + the dataset to search through for tables to clean up. + + Returns: + None + """ + session = get_global_session() + client = session.bqclient + + if (location is None) != (project is None): + raise ValueError( + "Only one of project or location was given. Must specify both or neither." 
+ ) + elif location is None and project is None: + dataset = session._anonymous_dataset + else: + dataset = bigframes.session._io.bigquery.create_bq_dataset_reference( + client, location=location, project=project + ) + + bigframes.session._io.bigquery.delete_tables_matching_session_id( + client, dataset, session_id + ) + + # pandas dtype attributes NA = pandas.NA BooleanDtype = pandas.BooleanDtype diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6b84d838cf..2425369edd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -21,6 +21,7 @@ import logging import os import re +import secrets import typing from typing import ( Any, @@ -37,6 +38,7 @@ Tuple, Union, ) +import uuid import warnings # Even though the ibis.backends.bigquery import is unused, it's needed @@ -100,6 +102,8 @@ _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" +_TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" + _MAX_CLUSTER_COLUMNS = 4 # TODO(swast): Need to connect to regional endpoints when performing remote @@ -203,7 +207,11 @@ def __init__( bq_kms_key_name=self._bq_kms_key_name, ) - self._create_bq_datasets() + self._anonymous_dataset = ( + bigframes.session._io.bigquery.create_bq_dataset_reference( + self.bqclient, location=self._location + ) + ) # TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494 # has been fixed. The ibis client changes the default query job config @@ -233,6 +241,13 @@ def __init__( bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table] ] = {} + # unique session identifier, short enough to be human readable + # only needs to be unique among sessions created by the same user + # at the same time in the same region + self._session_id: str = "session" + secrets.token_hex(3) + self._table_ids: List[str] = [] + # store table ids and delete them when the session is closed + @property def bqclient(self): return self._clients_provider.bqclient @@ -263,6 +278,10 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager + @property + def session_id(self): + return self._session_id + @property def _project(self): return self.bqclient.project @@ -271,24 +290,15 @@ def __hash__(self): # Stable hash needed to use in expression tree return hash(str(self._anonymous_dataset)) - def _create_bq_datasets(self): - """Create and identify dataset(s) for temporary BQ resources.""" - query_job = self.bqclient.query("SELECT 1", location=self._location) - query_job.result() # blocks until finished - - # The anonymous dataset is used by BigQuery to write query results and - # session tables. BigQuery DataFrames also writes temp tables directly - # to the dataset, no BigQuery Session required. Note: there is a - # different anonymous dataset per location. See: - # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored - query_destination = query_job.destination - self._anonymous_dataset = bigquery.DatasetReference( - query_destination.project, - query_destination.dataset_id, - ) - def close(self): - """No-op. 
Temporary resources are deleted after 7 days.""" + """Delete tables that were created with this session's session_id.""" + client = self.bqclient + project_id = self._anonymous_dataset.project + dataset_id = self._anonymous_dataset.dataset_id + + for table_id in self._table_ids: + full_id = ".".join([project_id, dataset_id, table_id]) + client.delete_table(full_id, not_found_ok=True) def read_gbq( self, @@ -1063,7 +1073,7 @@ def _read_pandas_load_job( job_config.labels = {"bigframes-api": api_name} - load_table_destination = bigframes_io.random_table(self._anonymous_dataset) + load_table_destination = self._random_table() load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1145,7 +1155,7 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ) -> dataframe.DataFrame: - table = bigframes_io.random_table(self._anonymous_dataset) + table = self._random_table() if engine is not None and engine == "bigquery": if any(param is not None for param in (dtype, names)): @@ -1282,7 +1292,7 @@ def read_parquet( *, engine: str = "auto", ) -> dataframe.DataFrame: - table = bigframes_io.random_table(self._anonymous_dataset) + table = self._random_table() if engine == "bigquery": job_config = self._prepare_load_job_config() @@ -1319,7 +1329,7 @@ def read_json( engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", **kwargs, ) -> dataframe.DataFrame: - table = bigframes_io.random_table(self._anonymous_dataset) + table = self._random_table() if engine == "bigquery": @@ -1416,14 +1426,12 @@ def _create_empty_temp_table( ) -> bigquery.TableReference: # Can't set a table in _SESSION as destination via query job API, so we # run DDL, instead. - dataset = self._anonymous_dataset expiration = ( datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION ) table = bigframes_io.create_temp_table( - self.bqclient, - dataset, + self, expiration, schema=schema, cluster_columns=cluster_cols, @@ -1939,6 +1947,32 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference: + """Generate a random table ID with BigQuery DataFrames prefix. + + The generated ID will be stored and checked for deletion when the + session is closed, unless skip_cleanup is True. + + Args: + skip_cleanup (bool, default False): + If True, do not add the generated ID to the list of tables + to clean up when the session is closed. + + Returns: + google.cloud.bigquery.TableReference: + Fully qualified table ID of a table that doesn't exist. 
+ """ + dataset = self._anonymous_dataset + session_id = self.session_id + now = datetime.datetime.now(datetime.timezone.utc) + random_id = uuid.uuid4().hex + table_id = _TEMP_TABLE_ID_FORMAT.format( + date=now.strftime("%Y%m%d"), session_id=session_id, random_id=random_id + ) + if not skip_cleanup: + self._table_ids.append(table_id) + return dataset.table(table_id) + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 2cd2d8ff9a..79108c71a2 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -22,7 +22,6 @@ import textwrap import types from typing import Dict, Iterable, Optional, Sequence, Tuple, Union -import uuid import google.api_core.exceptions import google.cloud.bigquery as bigquery @@ -33,7 +32,8 @@ IO_ORDERING_ID = "bqdf_row_nums" MAX_LABELS_COUNT = 64 -TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" +_LIST_TABLES_LIMIT = 10000 # calls to bqclient.list_tables +# will be limited to this many tables LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -98,39 +98,25 @@ def create_export_data_statement( ) -def random_table(dataset: bigquery.DatasetReference) -> bigquery.TableReference: - """Generate a random table ID with BigQuery DataFrames prefix. - Args: - dataset (google.cloud.bigquery.DatasetReference): - The dataset to make the table reference in. Usually the anonymous - dataset for the session. - Returns: - google.cloud.bigquery.TableReference: - Fully qualified table ID of a table that doesn't exist. - """ - now = datetime.datetime.now(datetime.timezone.utc) - random_id = uuid.uuid4().hex - table_id = TEMP_TABLE_PREFIX.format( - date=now.strftime("%Y%m%d"), random_id=random_id - ) - return dataset.table(table_id) - - def table_ref_to_sql(table: bigquery.TableReference) -> str: """Format a table reference as escaped SQL.""" return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" def create_temp_table( - bqclient: bigquery.Client, - dataset: bigquery.DatasetReference, + session: bigframes.session.Session, expiration: datetime.datetime, *, schema: Optional[Iterable[bigquery.SchemaField]] = None, cluster_columns: Optional[list[str]] = None, ) -> str: - """Create an empty table with an expiration in the desired dataset.""" - table_ref = random_table(dataset) + """Create an empty table with an expiration in the desired session. + + The table will be deleted when the session is closed or the expiration + is reached. + """ + bqclient: bigquery.Client = session.bqclient + table_ref = session._random_table() destination = bigquery.Table(table_ref) destination.expires = expiration destination.schema = schema @@ -257,3 +243,71 @@ def pytest_log_job(query_job: bigquery.QueryJob): bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") with open(bytes_file, "a") as f: f.write(str(bytes_processed) + "\n") + + +def delete_tables_matching_session_id( + client: bigquery.Client, dataset: bigquery.DatasetReference, session_id: str +) -> None: + """Searches within the dataset for tables conforming to the + expected session_id form, and instructs bigquery to delete them. 
+ + Args: + client (bigquery.Client): + The client to use to list tables + dataset (bigquery.DatasetReference): + The dataset to search in + session_id (str): + The session id to match on in the table name + + Returns: + None + """ + + tables = client.list_tables( + dataset, max_results=_LIST_TABLES_LIMIT, page_size=_LIST_TABLES_LIMIT + ) + for table in tables: + split_id = table.table_id.split("_") + if not split_id[0].startswith("bqdf") or len(split_id) < 2: + continue + found_session_id = split_id[1] + if found_session_id == session_id: + client.delete_table(table, not_found_ok=True) + print("Deleting temporary table '{}'.".format(table.table_id)) + + +def create_bq_dataset_reference( + bq_client: bigquery.Client, location=None, project=None +) -> bigquery.DatasetReference: + """Create and identify dataset(s) for temporary BQ resources. + + bq_client project and location will be used unless kwargs "project" + and/or "location" are given. If given, location and project + will be passed through to + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Args: + bq_client (bigquery.Client): + The bigquery.Client to use for the http request to + create the dataset reference. + location (str, default None): + The location of the project to create the dataset in. + project (str, default None): + The project id of the project to create the dataset in. + + Returns: + bigquery.DatasetReference: The constructed reference to the anonymous dataset. + """ + query_job = bq_client.query("SELECT 1", location=location, project=project) + query_job.result() # blocks until finished + + # The anonymous dataset is used by BigQuery to write query results and + # session tables. BigQuery DataFrames also writes temp tables directly + # to the dataset, no BigQuery Session required. Note: there is a + # different anonymous dataset per location. 
See: + # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored + query_destination = query_job.destination + return bigquery.DatasetReference( + query_destination.project, + query_destination.dataset_id, + ) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a9fb8cb3f5..4ebb3cb93b 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -19,7 +19,7 @@ import pathlib import textwrap import typing -from typing import Dict, Optional +from typing import Dict, Generator, Optional import google.api_core.exceptions import google.cloud.bigquery as bigquery @@ -127,19 +127,23 @@ def resourcemanager_client( @pytest.fixture(scope="session") -def session() -> bigframes.Session: +def session() -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions( location="US", ) - return bigframes.Session(context=context) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time @pytest.fixture(scope="session") -def session_tokyo(tokyo_location: str) -> bigframes.Session: +def session_tokyo(tokyo_location: str) -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions( location=tokyo_location, ) - return bigframes.Session(context=context) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup type @pytest.fixture(scope="session", autouse=True) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index 62fa5a83d3..c7a19dc26e 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
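# A sketch of the cleanup workflow exercised by the tests below (the location
# and project values are illustrative). Temporary tables carry the session id
# in their names, so they can be removed either by closing the session or, if
# the session object was lost, by its id:
#
#     session = bigframes.Session()
#     session_id = session.session_id
#     ...  # operations that create temporary tables
#     session.close()  # deletes this session's temporary tables
#
#     # Or later, without the session object:
#     import bigframes.pandas as bpd
#     bpd.clean_up_by_session_id(session_id, location="US", project="my-project")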
+import datetime + +import google.cloud.exceptions import pytest -from bigframes import Session +import bigframes +import bigframes.pandas as bpd @pytest.mark.parametrize( @@ -46,7 +50,80 @@ # ), ], ) -def test_read_gbq_for_large_tables(session: Session, query_or_table, index_col): +def test_read_gbq_for_large_tables( + session: bigframes.Session, query_or_table, index_col +): """Verify read_gbq() is able to read large tables.""" df = session.read_gbq(query_or_table, index_col=index_col) assert len(df.columns) != 0 + + +def test_close(session): + # we will create two tables and confirm that they are deleted + # when the session is closed + + bqclient = session.bqclient + + expiration = ( + datetime.datetime.now(datetime.timezone.utc) + + bigframes.constants.DEFAULT_EXPIRATION + ) + full_id_1 = bigframes.session._io.bigquery.create_temp_table(session, expiration) + full_id_2 = bigframes.session._io.bigquery.create_temp_table(session, expiration) + + # check that the tables were actually created + assert bqclient.get_table(full_id_1).created is not None + assert bqclient.get_table(full_id_2).created is not None + + session.close() + + # check that the tables are already deleted + with pytest.raises(google.cloud.exceptions.NotFound): + bqclient.delete_table(full_id_1) + with pytest.raises(google.cloud.exceptions.NotFound): + bqclient.delete_table(full_id_2) + + +def test_clean_up_by_session_id(): + # we do this test in a different region in order to avoid + # overly large amounts of temp tables slowing the test down + option_context = bigframes.BigQueryOptions() + option_context.location = "europe-west10" + session = bigframes.Session(context=option_context) + session_id = session.session_id + + # we will create two tables and confirm that they are deleted + # when the session is closed by id + + bqclient = session.bqclient + dataset = session._anonymous_dataset + expiration = ( + datetime.datetime.now(datetime.timezone.utc) + + bigframes.constants.DEFAULT_EXPIRATION + ) + bigframes.session._io.bigquery.create_temp_table(session, expiration) + bigframes.session._io.bigquery.create_temp_table(session, expiration) + + # check that some table exists with the expected session_id + tables_before = bqclient.list_tables( + dataset, + max_results=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, + page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, + ) + assert any( + [(session.session_id in table.full_table_id) for table in list(tables_before)] + ) + + bpd.clean_up_by_session_id( + session_id, location=session._location, project=session._project + ) + + # check that no tables with the session_id are left after cleanup + tables_after = bqclient.list_tables( + dataset, + max_results=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, + page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, + ) + assert not any( + [(session.session_id in table.full_table_id) for table in list(tables_after)] + ) diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index eae667dc9d..fcaca7a493 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -86,9 +86,7 @@ def test_session_load_job(bq_cmek, session_with_bq_cmek): pytest.skip("no cmek set for testing") # pragma: NO COVER # Session should have cmek set in the default query and load job configs - load_table = bigframes.session._io.bigquery.random_table( - session_with_bq_cmek._anonymous_dataset - ) + load_table = session_with_bq_cmek._random_table() df = 
pandas.DataFrame({"col0": [1, 2, 3]}) load_job_config = session_with_bq_cmek._prepare_load_job_config() @@ -188,9 +186,7 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): # Write the result to BQ custom table and assert encryption session_with_bq_cmek.bqclient.get_table(output_table_id) - output_table_ref = bigframes.session._io.bigquery.random_table( - session_with_bq_cmek._anonymous_dataset - ) + output_table_ref = session_with_bq_cmek._random_table() output_table_id = str(output_table_ref) df.to_gbq(output_table_id) output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index c410d70fe7..dd13196981 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -14,6 +14,7 @@ import datetime from unittest import mock +import warnings import google.api_core.exceptions import google.auth @@ -254,7 +255,7 @@ def test_read_gbq_must_comply_with_set_location_non_US( assert df is not None -def test_close_session_after_credentials_need_reauthentication(monkeypatch): +def test_credentials_need_reauthentication(monkeypatch): # Use a simple test query to verify that default session works to interact # with BQ test_query = "SELECT 1" @@ -288,8 +289,12 @@ def test_close_session_after_credentials_need_reauthentication(monkeypatch): with pytest.raises(google.auth.exceptions.RefreshError): bpd.read_gbq(test_query) - # Now verify that closing the session works - bpd.close_session() + # Now verify that closing the session works and we throw + # the expected warning + with warnings.catch_warnings(record=True) as warned: + bpd.close_session() # CleanupFailedWarning: can't clean up + assert len(warned) == 1 + assert warned[0].category == bigframes.exceptions.CleanupFailedWarning assert bigframes.core.global_session._global_session is None # Now verify that use is able to start over diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index eed1acb5a3..43865fc2c8 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -14,7 +14,6 @@ import datetime from typing import Iterable -import unittest.mock as mock import google.cloud.bigquery as bigquery import pytest @@ -139,16 +138,15 @@ def test_create_job_configs_labels_length_limit_met(): def test_create_temp_table_default_expiration(): """Make sure the created table has an expiration.""" - bqclient = mock.create_autospec(bigquery.Client) - dataset = bigquery.DatasetReference("test-project", "test_dataset") expiration = datetime.datetime( 2023, 11, 2, 13, 44, 55, 678901, datetime.timezone.utc ) - bigframes.session._io.bigquery.create_temp_table(bqclient, dataset, expiration) + session = resources.create_bigquery_session() + bigframes.session._io.bigquery.create_temp_table(session, expiration) - bqclient.create_table.assert_called_once() - call_args = bqclient.create_table.call_args + session.bqclient.create_table.assert_called_once() + call_args = session.bqclient.create_table.call_args table = call_args.args[0] assert table.project == "test-project" assert table.dataset_id == "test_dataset" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c5168cd160..4e17bca54d 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -411,7 +411,7 @@ def to_gbq( >>> df = 
bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> destination = df.to_gbq(ordering_id="ordering_id") >>> # The table created can be read outside of the current session. - >>> bpd.close_session() # For demonstration, only. + >>> bpd.close_session() # Optional, to demonstrate a new session. >>> bpd.read_gbq(destination, index_col="ordering_id") col1 col2 ordering_id diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index bf016357a6..2409068fa8 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -57,6 +57,7 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> import numpy as np >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) @@ -235,6 +236,7 @@ def scatter( in a DataFrame's columns. >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) From 36578ab431119f71dda746de415d0c6417bb4de2 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 3 May 2024 02:33:46 +0000 Subject: [PATCH 10/17] feat: support gcf max instance count in `remote_function` (#657) * feat: support gcf max instance count in `remote_function` * fix comment in test * enable back the retry annotation --- bigframes/functions/remote_function.py | 32 ++++++++++++++--- bigframes/pandas/__init__.py | 2 ++ bigframes/session/__init__.py | 10 ++++++ tests/system/large/test_remote_function.py | 40 ++++++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index f7237c564c..9d826d0fa1 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -342,7 +342,12 @@ def generate_cloud_function_code(self, def_, dir, package_requirements=None): return entry_point def create_cloud_function( - self, def_, cf_name, package_requirements=None, cloud_function_timeout=600 + self, + def_, + cf_name, + package_requirements=None, + timeout_seconds=600, + max_instance_count=None, ): """Create a cloud function from the given user defined function.""" @@ -411,14 +416,16 @@ def create_cloud_function( ) function.service_config = functions_v2.ServiceConfig() function.service_config.available_memory = "1024M" - if cloud_function_timeout is not None: - if cloud_function_timeout > 1200: + if timeout_seconds is not None: + if timeout_seconds > 1200: raise ValueError( "BigQuery remote function can wait only up to 20 minutes" ", see for more details " "https://cloud.google.com/bigquery/quotas#remote_function_limits." 
) - function.service_config.timeout_seconds = cloud_function_timeout + function.service_config.timeout_seconds = timeout_seconds + if max_instance_count is not None: + function.service_config.max_instance_count = max_instance_count function.service_config.service_account_email = ( self._cloud_function_service_account ) @@ -466,6 +473,7 @@ def provision_bq_remote_function( package_requirements, max_batching_rows, cloud_function_timeout, + cloud_function_max_instance_count, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -487,7 +495,11 @@ def provision_bq_remote_function( # Create the cloud function if it does not exist if not cf_endpoint: cf_endpoint = self.create_cloud_function( - def_, cloud_function_name, package_requirements, cloud_function_timeout + def_, + cloud_function_name, + package_requirements, + cloud_function_timeout, + cloud_function_max_instance_count, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -642,6 +654,7 @@ def remote_function( cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -778,6 +791,14 @@ def remote_function( https://cloud.google.com/bigquery/quotas#remote_function_limits. By default BigQuery DataFrames uses a 10 minute timeout. `None` can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximumm instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at max at any given point of time. Lower setting can help + control the spike in the billing. Higher setting can help + support processing larger scale data. When not specified, cloud + function's default setting applies. 
For more details see + https://cloud.google.com/functions/docs/configuring/max-instances """ if isinstance(input_types, type): input_types = [input_types] @@ -906,6 +927,7 @@ def wrapper(f): packages, max_batching_rows, cloud_function_timeout, + cloud_function_max_instances, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fc87bec3df..2200fd6aa4 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -652,6 +652,7 @@ def remote_function( cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -667,6 +668,7 @@ def remote_function( cloud_function_docker_repository=cloud_function_docker_repository, max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instances=cloud_function_max_instances, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2425369edd..ac03b56f94 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1466,6 +1466,7 @@ def remote_function( cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1580,6 +1581,14 @@ def remote_function( https://cloud.google.com/bigquery/quotas#remote_function_limits. By default BigQuery DataFrames uses a 10 minute timeout. `None` can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximumm instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at max at any given point of time. Lower setting can help + control the spike in the billing. Higher setting can help + support processing larger scale data. When not specified, cloud + function's default setting applies. For more details see + https://cloud.google.com/functions/docs/configuring/max-instances Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. 
The cloud assets can be @@ -1603,6 +1612,7 @@ def remote_function( cloud_function_docker_repository=cloud_function_docker_repository, max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instances=cloud_function_max_instances, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index eb7cb8308b..eb2a0884fe 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1414,3 +1414,43 @@ def test_remote_function_gcf_timeout_max_supported_exceeded(session): @session.remote_function([int], int, reuse=False, cloud_function_timeout=1201) def square(x): return x * x + + +@pytest.mark.parametrize( + ("max_instances_args", "expected_max_instances"), + [ + pytest.param({}, 100, id="no-set"), + pytest.param({"cloud_function_max_instances": None}, 100, id="set-None"), + pytest.param({"cloud_function_max_instances": 1000}, 1000, id="set-explicit"), + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_max_instances( + session, scalars_dfs, max_instances_args, expected_max_instances +): + try: + + def square(x): + return x * x + + square_remote = session.remote_function( + [int], int, reuse=False, **max_instances_args + )(square) + + # Assert that the GCF is created with the intended max instance count + gcf = session.cloudfunctionsclient.get_function( + name=square_remote.bigframes_cloud_function + ) + assert gcf.service_config.max_instance_count == expected_max_instances + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_too"].apply(square_remote).to_pandas() + pd_result = scalars_pandas_df["int64_too"].apply(square) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square_remote + ) From f5617994bc136de5caa72719b8c3c297c512cb36 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 3 May 2024 10:25:15 -0700 Subject: [PATCH 11/17] feat: custom query labels for compute options (#638) * feat: Custom query labels for compute options * Update docstring * Update test * Code example update. 
* ignore type * update format --- bigframes/_config/compute_options.py | 53 +++++++++++++++++++++++++++- bigframes/session/__init__.py | 6 ++++ tests/system/small/test_session.py | 18 ++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 81ef044f4d..c8a54fe0b3 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -15,7 +15,7 @@ """Options for displaying objects.""" import dataclasses -from typing import Optional +from typing import Any, Dict, Optional @dataclasses.dataclass @@ -34,6 +34,26 @@ class ComputeOptions: >>> bpd.options.compute.maximum_bytes_billed = None # reset option + To add multiple extra labels to a query configuration, use the `assign_extra_query_labels` + method with keyword arguments: + + >>> bpd.options.compute.assign_extra_query_labels(test1=1, test2="abc") + >>> bpd.options.compute.extra_query_labels + {'test1': 1, 'test2': 'abc'} + + Alternatively, you can add labels individually by directly accessing the `extra_query_labels` + dictionary: + + >>> bpd.options.compute.extra_query_labels["test3"] = False + >>> bpd.options.compute.extra_query_labels + {'test1': 1, 'test2': 'abc', 'test3': False} + + To remove a label from the configuration, use the `del` keyword on the desired label key: + + >>> del bpd.options.compute.extra_query_labels["test1"] + >>> bpd.options.compute.extra_query_labels + {'test2': 'abc', 'test3': False} + Attributes: maximum_bytes_billed (int, Options): Limits the bytes billed for query jobs. Queries that will have @@ -44,7 +64,38 @@ class ComputeOptions: If enabled, large queries may be factored into multiple smaller queries in order to avoid generating queries that are too complex for the query engine to handle. However this comes at the cost of increase cost and latency. + extra_query_labels (Dict[str, Any], Options): + Stores additional custom labels for query configuration. """ maximum_bytes_billed: Optional[int] = None enable_multi_query_execution: bool = False + extra_query_labels: Dict[str, Any] = dataclasses.field( + default_factory=dict, init=False + ) + + def assign_extra_query_labels(self, **kwargs: Any) -> None: + """ + Assigns additional custom labels for query configuration. The method updates the + `extra_query_labels` dictionary with new labels provided through keyword arguments. + + Args: + kwargs (Any): + Custom labels provided as keyword arguments. Each key-value pair + in `kwargs` represents a label name and its value. + + Raises: + ValueError: If a key matches one of the reserved attribute names, + specifically 'maximum_bytes_billed' or 'enable_multi_query_execution', + to prevent conflicts with built-in settings. + """ + reserved_keys = ["maximum_bytes_billed", "enable_multi_query_execution"] + for key in kwargs: + if key in reserved_keys: + raise ValueError( + f"'{key}' is a reserved attribute name. Please use " + "a different key for your custom labels to avoid " + "conflicts with built-in settings." 
+ ) + + self.extra_query_labels.update(kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ac03b56f94..7c7d93541c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1699,6 +1699,12 @@ def _prepare_query_job_config( bigframes.options.compute.maximum_bytes_billed ) + current_labels = job_config.labels if job_config.labels else {} + for key, value in bigframes.options.compute.extra_query_labels.items(): + if key not in current_labels: + current_labels[key] = value + job_config.labels = current_labels + if self._bq_kms_key_name: job_config.destination_encryption_configuration = ( bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2779874d6c..6b2d7df50d 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -405,6 +405,24 @@ def test_read_gbq_with_configuration( assert df.shape == (9, 3) +def test_read_gbq_with_custom_global_labels( + session: bigframes.Session, scalars_table_id: str +): + bigframes.options.compute.assign_extra_query_labels(test1=1, test2="abc") + bigframes.options.compute.extra_query_labels["test3"] = False + + job_labels = session.read_gbq(scalars_table_id).query_job.labels # type:ignore + expected_labels = {"test1": "1", "test2": "abc", "test3": "false"} + + assert all(job_labels.get(key) == value for key, value in expected_labels.items()) + + del bigframes.options.compute.extra_query_labels["test1"] + del bigframes.options.compute.extra_query_labels["test2"] + del bigframes.options.compute.extra_query_labels["test3"] + + assert len(bigframes.options.compute.extra_query_labels) == 0 + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) From 2715d2b4a353710175a66a4f6149356f583f2c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 3 May 2024 19:54:33 -0500 Subject: [PATCH 12/17] fix: downgrade NoDefaultIndexError to DefaultIndexWarning (#658) --- bigframes/exceptions.py | 4 +- .../session/_io/bigquery/read_gbq_table.py | 7 ++-- tests/unit/session/test_session.py | 39 +++++++++++-------- .../bigframes_vendored/pandas/io/gbq.py | 2 +- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 222df069f6..5caf2aa1df 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -25,5 +25,5 @@ class CleanupFailedWarning(Warning): """Bigframes failed to clean up a table resource.""" -class NoDefaultIndexError(ValueError): - """Unable to create a default index.""" +class DefaultIndexWarning(Warning): + """Default index may cause unexpected costs.""" diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 29d5a5567f..f6c1463e6c 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -277,13 +277,14 @@ def get_index_cols_and_uniqueness( # resource utilization because of the default sequential index. See # internal issue 335727141. if _is_table_clustered_or_partitioned(table) and not primary_keys: - raise bigframes.exceptions.NoDefaultIndexError( + warnings.warn( f"Table '{str(table.reference)}' is clustered and/or " "partitioned, but BigQuery DataFrames was not able to find a " - "suitable index. 
To avoid this error, set at least one of: " + "suitable index. To avoid this warning, set at least one of: " # TODO(b/338037499): Allow max_results to override this too, # once we make it more efficient. - "`index_col` or `filters`." + "`index_col` or `filters`.", + category=bigframes.exceptions.DefaultIndexWarning, ) # If there are primary keys defined, the query engine assumes these diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 70a121435c..a161c2df76 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -17,6 +17,7 @@ import os import re from unittest import mock +import warnings import google.api_core.exceptions import google.cloud.bigquery @@ -186,7 +187,7 @@ def get_table_mock(table_ref): @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_raised_by_read_gbq(table): +def test_default_index_warning_raised_by_read_gbq(table): """Because of the windowing operation to create a default index, row filters can't push down to the clustering column. @@ -202,12 +203,12 @@ def test_no_default_index_error_raised_by_read_gbq(table): session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location - with pytest.raises(bigframes.exceptions.NoDefaultIndexError): + with pytest.warns(bigframes.exceptions.DefaultIndexWarning): session.read_gbq("my-project.my_dataset.my_table") @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int64( +def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64( table, ): """Because of the windowing operation to create a default index, row @@ -224,11 +225,13 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int6 session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location - # No exception raised because we set the option allowing the default indexes. - df = session.read_gbq( - "my-project.my_dataset.my_table", - index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, - ) + # No warnings raised because we set the option allowing the default indexes. + with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq( + "my-project.my_dataset.my_table", + index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, + ) # We expect a window operation because we specificaly requested a sequential index. generated_sql = df.sql.casefold() @@ -246,7 +249,7 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int6 ), ) @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( +def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( total_count, distinct_count, table, @@ -270,10 +273,12 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( ) table._properties["location"] = session._location - # No exception raised because there are columns to use as the index. - df = session.read_gbq( - "my-project.my_dataset.my_table", index_col=("idx_1", "idx_2") - ) + # No warning raised because there are columns to use as the index. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq( + "my-project.my_dataset.my_table", index_col=("idx_1", "idx_2") + ) # There should be no analytic operators to prevent row filtering pushdown. assert "OVER" not in df.sql @@ -281,7 +286,7 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_primary_key(table): +def test_default_index_warning_not_raised_by_read_gbq_primary_key(table): """If a primary key is set on the table, we use that as the index column by default, no error should be raised in this case. @@ -310,8 +315,10 @@ def test_no_default_index_error_not_raised_by_read_gbq_primary_key(table): ) table._properties["location"] = session._location - # No exception raised because there is a primary key to use as the index. - df = session.read_gbq("my-project.my_dataset.my_table") + # No warning raised because there is a primary key to use as the index. + with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq("my-project.my_dataset.my_table") # There should be no analytic operators to prevent row filtering pushdown. assert "OVER" not in df.sql diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index c25dd8776f..38ea208eaf 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -157,7 +157,7 @@ def read_gbq( Alias for columns, retained for backwards compatibility. Raises: - bigframes.exceptions.NoDefaultIndexError: + bigframes.exceptions.DefaultIndexWarning: Using the default index is discouraged, such as with clustered or partitioned tables without primary keys. From 651fd7daf14273f172c6c55e5d6c374eb590a22d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Sun, 5 May 2024 20:02:48 -0500 Subject: [PATCH 13/17] feat: `bigframes.options` and `bigframes.option_context` now uses thread-local variables to prevent context managers in separate threads from affecting each other (#652) * feat: `bigframes.options` and `bigframes.option_context` now uses thread-local variables to prevent context managers in separate threads from affecting each other In our tests, this allows us to actually test things like `bf.option_context("display.repr_mode", "deferred"):` without always having some other test change the display mode and break the test. 
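As a rough illustration (not code from this change's diff; the option name and
value are just an example), the thread isolation works along these lines:

    import threading

    import bigframes.pandas as bpd

    def check_repr_mode():
        # Inside the context manager this thread sees "deferred"; the setting
        # lives in a thread-local, so other threads are unaffected.
        with bpd.option_context("display.repr_mode", "deferred"):
            assert bpd.options.display.repr_mode == "deferred"

    thread = threading.Thread(target=check_repr_mode)
    thread.start()
    thread.join()

    # The main thread still sees whatever repr_mode it had before.
    print(bpd.options.display.repr_mode)

Setting any "bigquery.*" option through `option_context` additionally starts a
thread-local session that lives only for the duration of the context manager.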
Fixes internal issue 308657813 * catch close errors on thread-local session too * use presence of _local.bigquery_options to indicate thread locality feat: always do a query dry run when `option.repr_mode == "deferred"` (#652) --- bigframes/_config/__init__.py | 74 +++++++++-- bigframes/core/global_session.py | 60 +++++++-- bigframes/core/indexes/base.py | 2 +- bigframes/dataframe.py | 6 +- bigframes/series.py | 2 +- tests/system/small/ml/test_llm.py | 120 ++++++++++-------- tests/system/small/test_dataframe.py | 17 +-- tests/system/small/test_pandas_options.py | 29 ++++- tests/system/small/test_progress_bar.py | 55 +++++--- .../pandas/_config/config.py | 29 ++++- 10 files changed, 275 insertions(+), 119 deletions(-) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index bdd7a8f2d6..bf33420e60 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -17,6 +17,9 @@ DataFrames from this package. """ +import copy +import threading + import bigframes_vendored.pandas._config.config as pandas_config import bigframes._config.bigquery_options as bigquery_options @@ -29,44 +32,91 @@ class Options: """Global options affecting BigQuery DataFrames behavior.""" def __init__(self): + self._local = threading.local() + + # Initialize these in the property getters to make sure we do have a + # separate instance per thread. + self._local.bigquery_options = None + self._local.display_options = None + self._local.sampling_options = None + self._local.compute_options = None + + # BigQuery options are special because they can only be set once per + # session, so we need an indicator as to whether we are using the + # thread-local session or the global session. self._bigquery_options = bigquery_options.BigQueryOptions() - self._display_options = display_options.DisplayOptions() - self._sampling_options = sampling_options.SamplingOptions() - self._compute_options = compute_options.ComputeOptions() + + def _init_bigquery_thread_local(self): + """Initialize thread-local options, based on current global options.""" + + # Already thread-local, so don't reset any options that have been set + # already. No locks needed since this only modifies thread-local + # variables. + if self._local.bigquery_options is not None: + return + + self._local.bigquery_options = copy.deepcopy(self._bigquery_options) + self._local.bigquery_options._session_started = False @property def bigquery(self) -> bigquery_options.BigQueryOptions: """Options to use with the BigQuery engine.""" + if self._local.bigquery_options is not None: + # The only way we can get here is if someone called + # _init_bigquery_thread_local. + return self._local.bigquery_options + return self._bigquery_options @property def display(self) -> display_options.DisplayOptions: """Options controlling object representation.""" - return self._display_options + if self._local.display_options is None: + self._local.display_options = display_options.DisplayOptions() + + return self._local.display_options @property def sampling(self) -> sampling_options.SamplingOptions: """Options controlling downsampling when downloading data - to memory. The data will be downloaded into memory explicitly + to memory. + + The data can be downloaded into memory explicitly (e.g., to_pandas, to_numpy, values) or implicitly (e.g., matplotlib plotting). This option can be overriden by - parameters in specific functions.""" - return self._sampling_options + parameters in specific functions. 
+ """ + if self._local.sampling_options is None: + self._local.sampling_options = sampling_options.SamplingOptions() + + return self._local.sampling_options @property def compute(self) -> compute_options.ComputeOptions: - """Options controlling object computation.""" - return self._compute_options + """Thread-local options controlling object computation.""" + if self._local.compute_options is None: + self._local.compute_options = compute_options.ComputeOptions() + + return self._local.compute_options + + @property + def is_bigquery_thread_local(self) -> bool: + """Indicator that we're using a thread-local session. + + A thread-local session can be started by using + `with bigframes.option_context("bigquery.some_option", "some-value"):`. + """ + return self._local.bigquery_options is not None options = Options() """Global options for default session.""" +option_context = pandas_config.option_context + __all__ = ( "Options", "options", + "option_context", ) - - -option_context = pandas_config.option_context diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 31dfc9bd17..3187c5c11b 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -26,6 +26,24 @@ _global_session: Optional[bigframes.session.Session] = None _global_session_lock = threading.Lock() +_global_session_state = threading.local() +_global_session_state.thread_local_session = None + + +def _try_close_session(session): + """Try to close the session and warn if couldn't.""" + try: + session.close() + except google.auth.exceptions.RefreshError as e: + session_id = session.session_id + location = session._location + project_id = session._project + warnings.warn( + f"Session cleanup failed for session with id: {session_id}, " + f"location: {location}, project: {project_id}", + category=bigframes.exceptions.CleanupFailedWarning, + ) + traceback.print_tb(e.__traceback__) def close_session() -> None: @@ -37,24 +55,30 @@ def close_session() -> None: Returns: None """ - global _global_session + global _global_session, _global_session_lock, _global_session_state + + if bigframes._config.options.is_bigquery_thread_local: + if _global_session_state.thread_local_session is not None: + _try_close_session(_global_session_state.thread_local_session) + _global_session_state.thread_local_session = None + + # Currently using thread-local options, so no global lock needed. + # Don't reset options.bigquery, as that's the responsibility + # of the context manager that started it in the first place. The user + # might have explicitly closed the session in the context manager and + # the thread-locality property needs to be retained. + bigframes._config.options.bigquery._session_started = False + + # Don't close the non-thread-local session. + return with _global_session_lock: if _global_session is not None: - try: - _global_session.close() - except google.auth.exceptions.RefreshError as e: - session_id = _global_session.session_id - location = _global_session._location - project_id = _global_session._project - warnings.warn( - f"Session cleanup failed for session with id: {session_id}, " - f"location: {location}, project: {project_id}", - category=bigframes.exceptions.CleanupFailedWarning, - ) - traceback.print_tb(e.__traceback__) + _try_close_session(_global_session) _global_session = None + # This should be global, not thread-local because of the if clause + # above. 
bigframes._config.options.bigquery._session_started = False @@ -63,7 +87,15 @@ def get_global_session(): Creates the global session if it does not exist. """ - global _global_session, _global_session_lock + global _global_session, _global_session_lock, _global_session_state + + if bigframes._config.options.is_bigquery_thread_local: + if _global_session_state.thread_local_session is None: + _global_session_state.thread_local_session = bigframes.session.connect( + bigframes._config.options.bigquery + ) + + return _global_session_state.thread_local_session with _global_session_lock: if _global_session is None: diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 46a9e30637..569dae4ffc 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -239,7 +239,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": - return formatter.repr_query_job(self.query_job) + return formatter.repr_query_job(self._block._compute_dry_run()) pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._query_job = query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d694216ebe..1f1fb5467f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -595,7 +595,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": - return formatter.repr_query_job(self.query_job) + return formatter.repr_query_job(self._compute_dry_run()) self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe @@ -632,9 +632,9 @@ def _repr_html_(self) -> str: many notebooks are not configured for large tables. """ opts = bigframes.options.display - max_results = bigframes.options.display.max_rows + max_results = opts.max_rows if opts.repr_mode == "deferred": - return formatter.repr_query_job_html(self.query_job) + return formatter.repr_query_job(self._compute_dry_run()) self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe diff --git a/bigframes/series.py b/bigframes/series.py index 3986d38445..aea3d60ff5 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -282,7 +282,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": - return formatter.repr_query_job(self.query_job) + return formatter.repr_query_job(self._compute_dry_run()) self._cached() pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 6f6b67597a..8a6874b178 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -55,25 +55,28 @@ def test_create_text_generator_model_default_session( ): import bigframes.pandas as bpd - bpd.close_session() - bpd.options.bigquery.bq_connection = bq_connection - bpd.options.bigquery.location = "us" - - model = llm.PaLM2TextGenerator() - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() > 20) + # Note: This starts a thread-local session. 
+ with bpd.option_context( + "bigquery.bq_connection", + bq_connection, + "bigquery.location", + "US", + ): + model = llm.PaLM2TextGenerator() + assert model is not None + assert model._bqml_model is not None + assert ( + model.connection_name.casefold() + == f"{bigquery_client.project}.us.bigframes-rf-conn" + ) + + llm_text_df = bpd.read_pandas(llm_text_pandas_df) + + df = model.predict(llm_text_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) @pytest.mark.flaky(retries=2) @@ -82,25 +85,28 @@ def test_create_text_generator_32k_model_default_session( ): import bigframes.pandas as bpd - bpd.close_session() - bpd.options.bigquery.bq_connection = bq_connection - bpd.options.bigquery.location = "us" - - model = llm.PaLM2TextGenerator(model_name="text-bison-32k") - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() > 20) + # Note: This starts a thread-local session. + with bpd.option_context( + "bigquery.bq_connection", + bq_connection, + "bigquery.location", + "US", + ): + model = llm.PaLM2TextGenerator(model_name="text-bison-32k") + assert model is not None + assert model._bqml_model is not None + assert ( + model.connection_name.casefold() + == f"{bigquery_client.project}.us.bigframes-rf-conn" + ) + + llm_text_df = bpd.read_pandas(llm_text_pandas_df) + + df = model.predict(llm_text_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) @pytest.mark.flaky(retries=2) @@ -232,27 +238,33 @@ def test_create_embedding_generator_multilingual_model( def test_create_text_embedding_generator_model_defaults(bq_connection): import bigframes.pandas as bpd - bpd.close_session() - bpd.options.bigquery.bq_connection = bq_connection - bpd.options.bigquery.location = "us" - - model = llm.PaLM2TextEmbeddingGenerator() - assert model is not None - assert model._bqml_model is not None + # Note: This starts a thread-local session. + with bpd.option_context( + "bigquery.bq_connection", + bq_connection, + "bigquery.location", + "US", + ): + model = llm.PaLM2TextEmbeddingGenerator() + assert model is not None + assert model._bqml_model is not None def test_create_text_embedding_generator_multilingual_model_defaults(bq_connection): import bigframes.pandas as bpd - bpd.close_session() - bpd.options.bigquery.bq_connection = bq_connection - bpd.options.bigquery.location = "us" - - model = llm.PaLM2TextEmbeddingGenerator( - model_name="textembedding-gecko-multilingual" - ) - assert model is not None - assert model._bqml_model is not None + # Note: This starts a thread-local session. 
+ with bpd.option_context( + "bigquery.bq_connection", + bq_connection, + "bigquery.location", + "US", + ): + model = llm.PaLM2TextEmbeddingGenerator( + model_name="textembedding-gecko-multilingual" + ) + assert model is not None + assert model._bqml_model is not None @pytest.mark.flaky(retries=2) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b428207314..5ed6908640 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -142,18 +142,13 @@ def test_df_construct_from_dict(): def test_df_construct_inline_respects_location(): import bigframes.pandas as bpd - bpd.close_session() - bpd.options.bigquery.location = "europe-west1" + # Note: This starts a thread-local session. + with bpd.option_context("bigquery.location", "europe-west1"): + df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) + repr(df) - df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) - repr(df) - - table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) - assert table.location == "europe-west1" - - # Reset global session - bpd.close_session() - bpd.options.bigquery.location = "us" + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) + assert table.location == "europe-west1" def test_get_column(scalars_dfs): diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index dd13196981..afb75c65e3 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -27,8 +27,11 @@ @pytest.fixture(autouse=True) def reset_default_session_and_location(): - bpd.close_session() - bpd.options.bigquery.location = None + # Note: This starts a thread-local session and closes it once the test + # finishes. + with bpd.option_context("bigquery.location", None): + bpd.options.bigquery.location = None + yield @pytest.mark.parametrize( @@ -80,7 +83,9 @@ def test_read_gbq_start_sets_session_location( ): read_method(query) - # Close global session to start over + # Close the global session to start over. + # Note: This is a thread-local operation because of the + # reset_default_session_and_location fixture above. bpd.close_session() # There should still be the previous location set in the bigquery options @@ -289,13 +294,25 @@ def test_credentials_need_reauthentication(monkeypatch): with pytest.raises(google.auth.exceptions.RefreshError): bpd.read_gbq(test_query) - # Now verify that closing the session works and we throw - # the expected warning + # Now verify that closing the session works We look at the + # thread-local session because of the + # reset_default_session_and_location fixture and that this test mutates + # state that might otherwise be used by tests running in parallel. 
+ assert ( + bigframes.core.global_session._global_session_state.thread_local_session + is not None + ) + with warnings.catch_warnings(record=True) as warned: bpd.close_session() # CleanupFailedWarning: can't clean up + assert len(warned) == 1 assert warned[0].category == bigframes.exceptions.CleanupFailedWarning - assert bigframes.core.global_session._global_session is None + + assert ( + bigframes.core.global_session._global_session_state.thread_local_session + is None + ) # Now verify that use is able to start over df = bpd.read_gbq(test_query) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 5ccc6db0ac..73a9743e2f 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -23,33 +23,37 @@ from bigframes.session import MAX_INLINE_DF_BYTES job_load_message_regex = r"\w+ job [\w-]+ is \w+\." +EXPECTED_DRY_RUN_MESSAGE = "Computation deferred. Computation will process" def test_progress_bar_dataframe( penguins_df_default_index: bf.dataframe.DataFrame, capsys ): - bf.options.display.progress_bar = "terminal" capsys.readouterr() # clear output - penguins_df_default_index.to_pandas() + + with bf.option_context("display.progress_bar", "terminal"): + penguins_df_default_index.to_pandas() assert_loading_msg_exist(capsys.readouterr().out) assert penguins_df_default_index.query_job is not None def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "terminal" series = penguins_df_default_index["body_mass_g"].head(10) capsys.readouterr() # clear output - series.to_pandas() + + with bf.option_context("display.progress_bar", "terminal"): + series.to_pandas() assert_loading_msg_exist(capsys.readouterr().out) assert series.query_job is not None def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "terminal" capsys.readouterr() # clear output - penguins_df_default_index["body_mass_g"].head(10).mean() + + with bf.option_context("display.progress_bar", "terminal"): + penguins_df_default_index["body_mass_g"].head(10).mean() assert_loading_msg_exist(capsys.readouterr().out) @@ -57,10 +61,11 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): - bf.options.display.progress_bar = "terminal" path = gcs_folder + "test_read_csv_progress_bar*.csv" capsys.readouterr() # clear output - penguins_df_default_index.to_csv(path) + + with bf.option_context("display.progress_bar", "terminal"): + penguins_df_default_index.to_csv(path) assert_loading_msg_exist(capsys.readouterr().out) @@ -73,8 +78,9 @@ def test_progress_bar_load_jobs( while len(df) < MAX_INLINE_DF_BYTES: df = pd.DataFrame(np.repeat(df.values, 2, axis=0)) - bf.options.display.progress_bar = "terminal" - with tempfile.TemporaryDirectory() as dir: + with bf.option_context( + "display.progress_bar", "terminal" + ), tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_progress_bar*.csv" df.to_csv(path, index=False) capsys.readouterr() # clear output @@ -96,11 +102,12 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - bf.options.display.progress_bar = "terminal" - penguins_df_default_index.to_pandas() - query_job_repr = formatting_helpers.repr_query_job_html( - 
penguins_df_default_index.query_job - ).value + with bf.option_context("display.progress_bar", "terminal"): + penguins_df_default_index.to_pandas() + query_job_repr = formatting_helpers.repr_query_job_html( + penguins_df_default_index.query_job + ).value + string_checks = [ "Job Id", "Destination Table", @@ -126,3 +133,21 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): ] for string in string_checks: assert string in query_job_repr + + +def test_query_job_dry_run_dataframe(penguins_df_default_index: bf.dataframe.DataFrame): + with bf.option_context("display.repr_mode", "deferred"): + df_result = repr(penguins_df_default_index) + assert EXPECTED_DRY_RUN_MESSAGE in df_result + + +def test_query_job_dry_run_index(penguins_df_default_index: bf.dataframe.DataFrame): + with bf.option_context("display.repr_mode", "deferred"): + index_result = repr(penguins_df_default_index.index) + assert EXPECTED_DRY_RUN_MESSAGE in index_result + + +def test_query_job_dry_run_series(penguins_df_default_index: bf.dataframe.DataFrame): + with bf.option_context("display.repr_mode", "deferred"): + series_result = repr(penguins_df_default_index["body_mass_g"]) + assert EXPECTED_DRY_RUN_MESSAGE in series_result diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py index 1b73e649c8..13ccfdac89 100644 --- a/third_party/bigframes_vendored/pandas/_config/config.py +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -7,10 +7,17 @@ class option_context(contextlib.ContextDecorator): """ - Context manager to temporarily set options in the `with` statement context. + Context manager to temporarily set thread-local options in the `with` + statement context. You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + .. note:: + + `"bigquery"` options can't be changed on a running session. Setting any + of these options creates a new thread-local session that only lives for + the lifetime of the context manager. + **Examples:** >>> import bigframes @@ -29,7 +36,11 @@ def __init__(self, *args) -> None: def __enter__(self) -> None: self.undo = [ - (pat, operator.attrgetter(pat)(bigframes.options)) for pat, val in self.ops + (pat, operator.attrgetter(pat)(bigframes.options)) + for pat, _ in self.ops + # Don't try to undo changes to bigquery options. We're starting and + # closing a new thread-local session if those are set. + if not pat.startswith("bigquery.") ] for pat, val in self.ops: @@ -40,7 +51,21 @@ def __exit__(self, *args) -> None: for pat, val in self.undo: self._set_option(pat, val) + # TODO(tswast): What to do if someone nests several context managers + # with separate "bigquery" options? We might need a "stack" of + # sessions if we allow that. + if bigframes.options.is_bigquery_thread_local: + bigframes.close_session() + + # Reset bigquery_options so that we're no longer thread-local. + bigframes.options._local.bigquery_options = None + def _set_option(self, pat, val): root, attr = pat.rsplit(".", 1) + + # We are now using a thread-specific session. 
+ if root == "bigquery": + bigframes.options._init_bigquery_thread_local() + parent = operator.attrgetter(root)(bigframes.options) setattr(parent, attr, val) From 4a342933559fba417fe42e2bd386838defdb2778 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Mon, 6 May 2024 12:50:17 -0700 Subject: [PATCH 14/17] fix: fix bug with na in the column labels in stack (#659) --- bigframes/core/__init__.py | 5 ++++- tests/system/small/test_multiindex.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 185ce7cd4f..eef0efcf83 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -429,7 +429,10 @@ def _create_unpivot_labels_array( for row_offset in range(len(former_column_labels)): row_label = former_column_labels[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = {col_ids[i]: row_label[i] for i in range(len(col_ids))} + row = { + col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None) + for i in range(len(col_ids)) + } rows.append(row) return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index bb0af52976..613ad945c1 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1191,3 +1191,22 @@ def test_explode_w_multi_index(): check_dtype=False, check_index_type=False, ) + + +def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", "c", "d"]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + pd_result = pd_df.stack() + bf_result = bf_df.stack().to_pandas() + + # Pandas produces pd.NA, where bq dataframes produces NaN + pd_result["c"] = pd_result["c"].replace(pandas.NA, np.nan) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) From 16866d2bbd4901b1bf57f7e8cfbdb444d63fee6c Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Mon, 6 May 2024 17:41:35 -0500 Subject: [PATCH 15/17] docs: add python code sample fore multiple forecasting time series (#531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add python code sample to multiple timeseries forecasting --------- Co-authored-by: Tim Sweña (Swast) --- ...e_multiple_timeseries_forecasting_model.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 samples/snippets/create_multiple_timeseries_forecasting_model.py diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model.py b/samples/snippets/create_multiple_timeseries_forecasting_model.py new file mode 100644 index 0000000000..26fc15595f --- /dev/null +++ b/samples/snippets/create_multiple_timeseries_forecasting_model.py @@ -0,0 +1,98 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_multiple_timeseries_forecasting_model(random_model_id): + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_arima_multiple_step_2_visualize] + + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + date = df["starttime"].dt.date + df.groupby([date]) + num_trips = features.groupby(["date"]).count() + + # Results from running "print(num_trips)" + + # num_trips + # date + # 2013-07-01 16650 + # 2013-07-02 22745 + # 2013-07-03 21864 + # 2013-07-04 22326 + # 2013-07-05 21842 + # 2013-07-06 20467 + # 2013-07-07 20477 + # 2013-07-08 21615 + # 2013-07-09 26641 + # 2013-07-10 25732 + # 2013-07-11 24417 + # 2013-07-12 19006 + # 2013-07-13 26119 + # 2013-07-14 29287 + # 2013-07-15 28069 + # 2013-07-16 29842 + # 2013-07-17 30550 + # 2013-07-18 28869 + # 2013-07-19 26591 + # 2013-07-20 25278 + # 2013-07-21 30297 + # 2013-07-22 25979 + # 2013-07-23 32376 + # 2013-07-24 35271 + # 2013-07-25 31084 + + num_trips.plot.line( + # Rotate the x labels so they are more visible. + rot=45, + ) + + # [END bigquery_dataframes_bqml_arima_multiple_step_2_visualize] + + # [START bigquery_dataframes_bqml_arima_multiple_step_3_fit] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips["date"].to_frame() + y = num_trips["num_trips"].to_frame() + + model.fit(X, y) + # The model.fit() call above created a temporary model. + # Use the to_gbq() method to write to a permanent location. + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_arima_multiple_step_3_fit] From e26ec206e27767f88a33847e135ee52935657aa2 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 6 May 2024 17:52:25 -0700 Subject: [PATCH 16/17] chore: disable BQML regression affected tests (#661) --- tests/system/large/ml/test_core.py | 3 +++ tests/system/large/ml/test_ensemble.py | 2 ++ tests/system/large/ml/test_pipeline.py | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index df387e6ee1..aec1065e41 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -13,10 +13,13 @@ # limitations under the License. import pandas +import pytest from bigframes.ml import globals +# TODO(garrettwu): Re-enable or not check exact numbers. 
+@pytest.mark.skip(reason="bqml regression") def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): df = penguins_df_default_index.dropna() X_train = df[ diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 2403644a42..2260e7bbce 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -20,6 +20,8 @@ import bigframes.ml.ensemble +# TODO(garrettwu): Re-enable or not check exact numbers. +@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c165b1e030..1a92d0f7d4 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -222,6 +222,8 @@ def test_pipeline_logistic_regression_fit_score_predict( ) +# TODO(garrettwu): Re-enable or not check exact numbers. +@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" @@ -297,6 +299,8 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) +# TODO(garrettwu): Re-enable or not check exact numbers. +@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index From ff23b1891817c10d1d70e030cf9aa5e870a6421c Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 23:45:56 -0700 Subject: [PATCH 17/17] chore(main): release 1.5.0 (#645) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b01e78ec42..f73d4b5750 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,35 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.4.0...v1.5.0) (2024-05-07) + + +### Features + +* `bigframes.options` and `bigframes.option_context` now uses thread-local variables to prevent context managers in separate threads from affecting each other ([#652](https://github.com/googleapis/python-bigquery-dataframes/issues/652)) ([651fd7d](https://github.com/googleapis/python-bigquery-dataframes/commit/651fd7daf14273f172c6c55e5d6c374eb590a22d)) +* Add `ARIMAPlus.coef_` property exposing `ML.ARIMA_COEFFICIENTS` functionality ([#585](https://github.com/googleapis/python-bigquery-dataframes/issues/585)) ([81d1262](https://github.com/googleapis/python-bigquery-dataframes/commit/81d1262a40c133017c6debe89506d66aab7bb0c5)) +* Add a unique session_id to Session and allow cleaning up sessions ([#553](https://github.com/googleapis/python-bigquery-dataframes/issues/553)) ([c8d4e23](https://github.com/googleapis/python-bigquery-dataframes/commit/c8d4e231fe8263f5b10fae9b879ff82df58da534)) +* Add the `bigframes.bigquery` sub-package with a `bigframes.bigquery.array_length` function ([#630](https://github.com/googleapis/python-bigquery-dataframes/issues/630)) 
([9963f85](https://github.com/googleapis/python-bigquery-dataframes/commit/9963f85b84c3b3c681447ab79e22ac93ac48349c)) +* Always do a query dry run when `option.repr_mode == "deferred"` ([#652](https://github.com/googleapis/python-bigquery-dataframes/issues/652)) ([651fd7d](https://github.com/googleapis/python-bigquery-dataframes/commit/651fd7daf14273f172c6c55e5d6c374eb590a22d)) +* Custom query labels for compute options ([#638](https://github.com/googleapis/python-bigquery-dataframes/issues/638)) ([f561799](https://github.com/googleapis/python-bigquery-dataframes/commit/f5617994bc136de5caa72719b8c3c297c512cb36)) +* Raise `NoDefaultIndexError` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set ([#631](https://github.com/googleapis/python-bigquery-dataframes/issues/631)) ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) +* Support `index_col=False` in `read_csv` and `engine="bigquery"` ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) +* Support gcf max instance count in `remote_function` ([#657](https://github.com/googleapis/python-bigquery-dataframes/issues/657)) ([36578ab](https://github.com/googleapis/python-bigquery-dataframes/commit/36578ab431119f71dda746de415d0c6417bb4de2)) + + +### Bug Fixes + +* Don't raise UnknownLocationWarning for US or EU multi-regions ([#653](https://github.com/googleapis/python-bigquery-dataframes/issues/653)) ([8e4616b](https://github.com/googleapis/python-bigquery-dataframes/commit/8e4616b896f4e0d13d8bb0424c89335d3a1fe697)) +* Downgrade NoDefaultIndexError to DefaultIndexWarning ([#658](https://github.com/googleapis/python-bigquery-dataframes/issues/658)) ([2715d2b](https://github.com/googleapis/python-bigquery-dataframes/commit/2715d2b4a353710175a66a4f6149356f583f2c45)) +* Fix bug with na in the column labels in stack ([#659](https://github.com/googleapis/python-bigquery-dataframes/issues/659)) ([4a34293](https://github.com/googleapis/python-bigquery-dataframes/commit/4a342933559fba417fe42e2bd386838defdb2778)) +* Use explicit session in `PaLM2TextGenerator` ([#651](https://github.com/googleapis/python-bigquery-dataframes/issues/651)) ([e4f13c3](https://github.com/googleapis/python-bigquery-dataframes/commit/e4f13c3633b90e32d3171976d8b27ed10049882f)) + + +### Documentation + +* Add python code sample for multiple forecasting time series ([#531](https://github.com/googleapis/python-bigquery-dataframes/issues/531)) ([16866d2](https://github.com/googleapis/python-bigquery-dataframes/commit/16866d2bbd4901b1bf57f7e8cfbdb444d63fee6c)) +* Fix the Palm2TextGenerator output token size ([#649](https://github.com/googleapis/python-bigquery-dataframes/issues/649)) ([c67e501](https://github.com/googleapis/python-bigquery-dataframes/commit/c67e501a4958ac097216cc1c0a9d5c1530c87ae5)) + ## [1.4.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.3.0...v1.4.0) (2024-04-29) diff --git a/bigframes/version.py b/bigframes/version.py index e892a8893f..5f56ef9c61 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.4.0" +__version__ = "1.5.0"
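A minimal usage sketch of two of the 1.5.0 additions listed above (custom query
labels and the Cloud Functions max-instance cap), assuming a project that is
already set up for BigQuery DataFrames remote functions; the label names,
values, and instance cap below are placeholders:

    import bigframes.pandas as bpd

    # Extra labels are merged into the labels of every query job the session
    # issues (existing label keys are not overwritten).
    bpd.options.compute.assign_extra_query_labels(team="data-eng", pipeline="nightly")

    # Cap how many Cloud Functions instances may back the remote function.
    # Leaving the argument as None keeps the Cloud Functions default limit.
    @bpd.remote_function([int], int, cloud_function_max_instances=100)
    def add_one(x: int) -> int:
        return x + 1

    # The decorated function can then be applied with Series.apply, e.g.
    # bpd.read_gbq(...)["some_int64_column"].apply(add_one)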