From 0e1a2c6048ce2455bface2a6ddc532171a5a711f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 12 May 2025 15:22:58 -0700 Subject: [PATCH 01/52] chore: fix AttributeError when rowiterator has no attribute in g3 (#1709) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: fix attributeerror for rowiterator has no attribute in g3 * Update bigframes/session/metrics.py Co-authored-by: Tim Sweña (Swast) * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix mypy --------- Co-authored-by: Tim Sweña (Swast) Co-authored-by: Owl Bot --- bigframes/session/metrics.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 055abd5899..6a8038e189 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -40,16 +40,15 @@ def count_job_stats( ): if query_job is None: assert row_iterator is not None - if (row_iterator.total_bytes_processed is None) or ( - row_iterator.query is None - ): + total_bytes_processed = getattr(row_iterator, "total_bytes_processed", None) + query = getattr(row_iterator, "query", None) + if total_bytes_processed is None or query is None: return - query_char_count = len(row_iterator.query) - bytes_processed = row_iterator.total_bytes_processed + self.execution_count += 1 - self.query_char_count += query_char_count - self.bytes_processed += bytes_processed - write_stats_to_disk(query_char_count, bytes_processed) + self.query_char_count += len(query) + self.bytes_processed += total_bytes_processed + write_stats_to_disk(len(query), total_bytes_processed) return stats = get_performance_stats(query_job) From 5585f7a86a3e123cdb4205765799f39639c787be Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 12 May 2025 15:49:23 -0700 Subject: [PATCH 02/52] chore: replace api_name with updated method_logger (#1660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: replace api_name with updated method_logger * fix * not supported error fix * fix for static methods * update unimplemented tracking to include pandas.xxx * update comments * Update bigframes/core/blocks.py * add escaping * add log name override * fix test --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/core/blocks.py | 3 +- bigframes/core/log_adapter.py | 117 ++++++++++++------ bigframes/pandas/__init__.py | 64 ++++++---- bigframes/pandas/io/api.py | 3 - bigframes/series.py | 2 +- bigframes/session/__init__.py | 53 +++----- bigframes/session/_io/bigquery/__init__.py | 12 +- .../session/_io/bigquery/read_gbq_table.py | 2 - bigframes/session/anonymous_dataset.py | 1 - bigframes/session/bq_caching_executor.py | 4 - bigframes/session/loader.py | 31 +---- tests/unit/core/test_log_adapter.py | 82 +++++++----- tests/unit/session/test_io_bigquery.py | 1 - tests/unit/session/test_read_gbq_colab.py | 2 +- tests/unit/session/test_read_gbq_table.py | 2 +- 15 files changed, 191 insertions(+), 188 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d3107a0623..ccb2ffe401 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2644,9 +2644,8 @@ def _get_rows_as_json_values(self) -> Block: SELECT {select_columns_csv} FROM T1 """ # The only ways this code is used is through df.apply(axis=1) cope path - # TODO: Stop using internal API destination, query_job 
= self.session._loader._query_to_destination( - json_sql, cluster_candidates=[ordering_column_name], api_name="apply" + json_sql, cluster_candidates=[ordering_column_name] ) if not destination: raise ValueError(f"Query job {query_job} did not produce result table") diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 8be46f531c..6021c7075a 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -15,7 +15,7 @@ import functools import inspect import threading -from typing import List +from typing import List, Optional from google.cloud import bigquery import pandas @@ -28,6 +28,7 @@ MAX_LABELS_COUNT = 64 - 8 PANDAS_API_TRACKING_TASK = "pandas_api_tracking" PANDAS_PARAM_TRACKING_TASK = "pandas_param_tracking" +LOG_OVERRIDE_NAME = "__log_override_name__" _api_methods: List = [] _excluded_methods = ["__setattr__", "__getattr__"] @@ -37,8 +38,8 @@ def submit_pandas_labels( - bq_client: bigquery.Client, - class_name: str, + bq_client: Optional[bigquery.Client], + base_name: str, method_name: str, args=(), kwargs={}, @@ -54,7 +55,7 @@ def submit_pandas_labels( Args: bq_client (bigquery.Client): The client used to interact with BigQuery. - class_name (str): The name of the pandas class being used. + base_name (str): The name of the pandas class/module being used. method_name (str): The name of the method being invoked. args (tuple): The positional arguments passed to the method. kwargs (dict): The keyword arguments passed to the method. @@ -63,25 +64,29 @@ def submit_pandas_labels( - 'PANDAS_PARAM_TRACKING_TASK': Indicates that the unimplemented feature is a parameter of a method. """ - if method_name.startswith("_") and not method_name.startswith("__"): + if bq_client is None or ( + method_name.startswith("_") and not method_name.startswith("__") + ): return labels_dict = { "task": task, - "class_name": class_name.lower(), + "class_name": base_name.lower(), "method_name": method_name.lower(), "args_count": len(args), } - if hasattr(pandas, class_name): - cls = getattr(pandas, class_name) + # getattr(pandas, "pandas") returns pandas + # so we can also use this for pandas.function + if hasattr(pandas, base_name): + base = getattr(pandas, base_name) else: return # Omit __call__, because its not implemented on the actual instances of # DataFrame/Series, only as the constructor. 
- if method_name != "__call__" and hasattr(cls, method_name): - method = getattr(cls, method_name) + if method_name != "__call__" and hasattr(base, method_name): + method = getattr(base, method_name) else: return @@ -110,30 +115,29 @@ def submit_pandas_labels( bq_client.query(query, job_config=job_config) -def class_logger(decorated_cls=None, /, *, include_internal_calls=False): +def class_logger(decorated_cls=None): """Decorator that adds logging functionality to each method of the class.""" def wrap(cls): for attr_name, attr_value in cls.__dict__.items(): if callable(attr_value) and (attr_name not in _excluded_methods): if isinstance(attr_value, staticmethod): - # TODO(b/390244171) support for staticmethod - pass + setattr( + cls, + attr_name, + staticmethod(method_logger(attr_value)), + ) else: setattr( cls, attr_name, - method_logger( - attr_value, - cls, - include_internal_calls, - ), + method_logger(attr_value), ) elif isinstance(attr_value, property): setattr( cls, attr_name, - property_logger(attr_value, cls, include_internal_calls), + property_logger(attr_value), ) return cls @@ -145,33 +149,39 @@ def wrap(cls): return wrap(decorated_cls) -def method_logger(method, decorated_cls, include_internal_calls: bool): +def method_logger(method, /, *, custom_base_name: Optional[str] = None): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) - def wrapper(self, *args, **kwargs): - class_name = decorated_cls.__name__ # Access decorated class name - api_method_name = str(method.__name__) - full_method_name = f"{class_name.lower()}-{api_method_name}" - + def wrapper(*args, **kwargs): + api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) + if custom_base_name is None: + qualname_parts = getattr(method, "__qualname__", method.__name__).split(".") + class_name = qualname_parts[-2] if len(qualname_parts) > 1 else "" + base_name = ( + class_name if class_name else "_".join(method.__module__.split(".")[1:]) + ) + else: + base_name = custom_base_name + + full_method_name = f"{base_name.lower()}-{api_method_name}" # Track directly called methods - if len(_call_stack) == 0 or include_internal_calls: + if len(_call_stack) == 0: add_api_method(full_method_name) _call_stack.append(full_method_name) try: - return method(self, *args, **kwargs) + return method(*args, **kwargs) except (NotImplementedError, TypeError) as e: # Log method parameters that are implemented in pandas but either missing (TypeError) # or not fully supported (NotImplementedError) in BigFrames. # Logging is currently supported only when we can access the bqclient through - # self._block.expr.session.bqclient. Also, to avoid generating multiple queries - # because of internal calls, we log only when the method is directly invoked. - if hasattr(self, "_block") and len(_call_stack) == 1: + # _block.session.bqclient. 
+ if len(_call_stack) == 1: submit_pandas_labels( - self._block.expr.session.bqclient, - class_name, + _get_bq_client(*args, **kwargs), + base_name, api_method_name, args, kwargs, @@ -184,22 +194,23 @@ def wrapper(self, *args, **kwargs): return wrapper -def property_logger(prop, decorated_cls, include_internal_calls: bool): +def property_logger(prop): """Decorator that adds logging functionality to a property.""" - def shared_wrapper(f): - @functools.wraps(f) + def shared_wrapper(prop): + @functools.wraps(prop) def wrapped(*args, **kwargs): - class_name = decorated_cls.__name__ - property_name = f.__name__ + qualname_parts = getattr(prop, "__qualname__", prop.__name__).split(".") + class_name = qualname_parts[-2] if len(qualname_parts) > 1 else "" + property_name = prop.__name__ full_property_name = f"{class_name.lower()}-{property_name.lower()}" - if len(_call_stack) == 0 or include_internal_calls: + if len(_call_stack) == 0: add_api_method(full_property_name) _call_stack.append(full_property_name) try: - return f(*args, **kwargs) + return prop(*args, **kwargs) finally: _call_stack.pop() @@ -213,12 +224,24 @@ def wrapped(*args, **kwargs): ) +def log_name_override(name: str): + """ + Attaches a custom name to be used by logger. + """ + + def wrapper(func): + setattr(func, LOG_OVERRIDE_NAME, name) + return func + + return wrapper + + def add_api_method(api_method_name): global _lock global _api_methods with _lock: # Push the method to the front of the _api_methods list - _api_methods.insert(0, api_method_name) + _api_methods.insert(0, api_method_name.replace("<", "").replace(">", "")) # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) _api_methods = _api_methods[:MAX_LABELS_COUNT] @@ -232,3 +255,17 @@ def get_and_reset_api_methods(dry_run: bool = False): if not dry_run: _api_methods.clear() return previous_api_methods + + +def _get_bq_client(*args, **kwargs): + # Assumes that on BigFrames API errors (TypeError/NotImplementedError), + # an input arg (likely the first, e.g., 'self') has `_block.session.bqclient` + for argv in args: + if hasattr(argv, "_block"): + return argv._block.session.bqclient + + for kwargv in kwargs.values(): + if hasattr(kwargv, "_block"): + return kwargv._block.session.bqclient + + return None diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index e90f123778..d08ef4e91d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -27,6 +27,7 @@ import pandas import bigframes._config as config +from bigframes.core import log_adapter import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes @@ -199,6 +200,7 @@ def get_default_session_id() -> str: return get_global_session().session_id +@log_adapter.method_logger def clean_up_by_session_id( session_id: str, location: Optional[str] = None, @@ -245,7 +247,6 @@ def clean_up_by_session_id( session.bqclient, location=location, project=project, - api_name="clean_up_by_session_id", ) bigframes.session._io.bigquery.delete_tables_matching_session_id( @@ -322,31 +323,33 @@ def reset_session(): except Exception: pass -# Use __all__ to let type checkers know what is part of the public API. 
-__all__ = [ - # Functions - "clean_up_by_session_id", - "concat", - "cut", - "get_default_session_id", - "get_dummies", - "merge", - "qcut", - "read_csv", - "read_gbq", - "read_gbq_function", - "read_gbq_model", - "read_gbq_object_table", - "read_gbq_query", - "read_gbq_table", - "read_json", - "read_pandas", - "read_parquet", - "read_pickle", - "remote_function", - "to_datetime", - "to_timedelta", - "from_glob_path", +_functions = [ + clean_up_by_session_id, + concat, + cut, + get_default_session_id, + get_dummies, + merge, + qcut, + read_csv, + read_gbq, + read_gbq_function, + read_gbq_model, + read_gbq_object_table, + read_gbq_query, + read_gbq_table, + read_json, + read_pandas, + read_parquet, + read_pickle, + remote_function, + to_datetime, + to_timedelta, + from_glob_path, +] + +_function_names = [_function.__name__ for _function in _functions] +_other_names = [ # pandas dtype attributes "NA", "BooleanDtype", @@ -371,3 +374,12 @@ def reset_session(): "reset_session", "udf", ] + +# Use __all__ to let type checkers know what is part of the public API. +__all__ = _function_names + _other_names + +_module = sys.modules[__name__] + +for _function in _functions: + _decorated_object = log_adapter.method_logger(_function, custom_base_name="pandas") + setattr(_module, _function.__name__, _decorated_object) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index ecf8a59bb7..16548dd4ad 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -44,11 +44,8 @@ ) import bigframes._config as config -import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes -import bigframes.core.reshape -import bigframes.core.tools import bigframes.dataframe import bigframes.enums import bigframes.series diff --git a/bigframes/series.py b/bigframes/series.py index 1e29671310..2c387734d3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1492,7 +1492,7 @@ def __getattr__(self, key: str): raise AttributeError(key) elif hasattr(pandas.Series, key): log_adapter.submit_pandas_labels( - self._block.expr.session.bqclient, self.__class__.__name__, key + self._block.session.bqclient, self.__class__.__name__, key ) raise AttributeError( textwrap.dedent( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 17945f0be6..81359ebb36 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -60,7 +60,7 @@ from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -from bigframes.core import blocks +from bigframes.core import blocks, log_adapter import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -104,6 +104,7 @@ logger = logging.getLogger(__name__) +@log_adapter.class_logger class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -445,7 +446,6 @@ def read_gbq( columns=columns, configuration=configuration, max_results=max_results, - api_name="read_gbq", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -463,7 +463,6 @@ def read_gbq( index_col=index_col, columns=columns, max_results=max_results, - api_name="read_gbq", use_cache=use_cache if use_cache is not None else True, filters=filters, dry_run=dry_run, @@ -497,6 +496,7 @@ def _read_gbq_colab( ) -> pandas.Series: ... 
+ @log_adapter.log_name_override("read_gbq_colab") def _read_gbq_colab( self, query: str, @@ -533,7 +533,6 @@ def _read_gbq_colab( return self._loader.read_gbq_query( query=query, index_col=bigframes.enums.DefaultIndexKind.NULL, - api_name="read_gbq_colab", force_total_order=False, dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run), ) @@ -654,7 +653,6 @@ def read_gbq_query( columns=columns, configuration=configuration, max_results=max_results, - api_name="read_gbq_query", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -737,7 +735,6 @@ def read_gbq_table( index_col=index_col, columns=columns, max_results=max_results, - api_name="read_gbq_table", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -773,7 +770,6 @@ def read_gbq_table_streaming( df = self._loader.read_gbq_table( table, - api_name="read_gbq_table_steaming", enable_snapshot=False, index_col=bigframes.enums.DefaultIndexKind.NULL, ) @@ -906,7 +902,6 @@ def read_pandas( if isinstance(pandas_dataframe, pandas.Series): bf_df = self._read_pandas( pandas.DataFrame(pandas_dataframe), - "read_pandas", write_engine=write_engine, ) bf_series = series.Series(bf_df._block) @@ -916,13 +911,10 @@ def read_pandas( if isinstance(pandas_dataframe, pandas.Index): return self._read_pandas( pandas.DataFrame(index=pandas_dataframe), - "read_pandas", write_engine=write_engine, ).index if isinstance(pandas_dataframe, pandas.DataFrame): - return self._read_pandas( - pandas_dataframe, "read_pandas", write_engine=write_engine - ) + return self._read_pandas(pandas_dataframe, write_engine=write_engine) else: raise ValueError( f"read_pandas() expects a pandas.DataFrame, but got a {type(pandas_dataframe)}" @@ -931,7 +923,6 @@ def read_pandas( def _read_pandas( self, pandas_dataframe: pandas.DataFrame, - api_name: str, *, write_engine: constants.WriteEngineType = "default", ) -> dataframe.DataFrame: @@ -959,17 +950,11 @@ def _read_pandas( ) return self._read_pandas_inline(pandas_dataframe) elif write_engine == "bigquery_load": - return self._loader.read_pandas( - pandas_dataframe, method="load", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="load") elif write_engine == "bigquery_streaming": - return self._loader.read_pandas( - pandas_dataframe, method="stream", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="stream") elif write_engine == "bigquery_write": - return self._loader.read_pandas( - pandas_dataframe, method="write", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="write") else: raise ValueError(f"Got unexpected write_engine '{write_engine}'") @@ -1097,7 +1082,7 @@ def _read_csv_w_pandas_engines( encoding=encoding, **kwargs, ) - return self._read_pandas(pandas_df, api_name="read_csv", write_engine=write_engine) # type: ignore + return self._read_pandas(pandas_df, write_engine=write_engine) # type: ignore def _read_csv_w_bigquery_engine( self, @@ -1198,11 +1183,9 @@ def read_pickle( if isinstance(pandas_obj, pandas.Series): if pandas_obj.name is None: pandas_obj.name = 0 - bigframes_df = self._read_pandas(pandas_obj.to_frame(), "read_pickle") + bigframes_df = self._read_pandas(pandas_obj.to_frame()) return bigframes_df[bigframes_df.columns[0]] - return self._read_pandas( - pandas_obj, api_name="read_pickle", write_engine=write_engine - ) + return self._read_pandas(pandas_obj, write_engine=write_engine) def read_parquet( self, @@ -1248,9 +1231,7 @@ def read_parquet( engine=engine, # type: ignore **read_parquet_kwargs, ) - 
return self._read_pandas( - pandas_obj, api_name="read_parquet", write_engine=write_engine - ) + return self._read_pandas(pandas_obj, write_engine=write_engine) def read_json( self, @@ -1329,9 +1310,7 @@ def read_json( engine=engine, **kwargs, ) - return self._read_pandas( - pandas_df, api_name="read_json", write_engine=write_engine - ) + return self._read_pandas(pandas_df, write_engine=write_engine) def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes @@ -1990,9 +1969,7 @@ def from_glob_path( table = self._create_object_table(path, connection) - s = self._loader.read_gbq_table(table, api_name="from_glob_path")[ - "uri" - ].str.to_blob(connection) + s = self._loader.read_gbq_table(table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() def _create_bq_connection( @@ -2045,9 +2022,7 @@ def read_gbq_object_table( table = self.bqclient.get_table(object_table) connection = table._properties["externalDataConfiguration"]["connectionId"] - s = self._loader.read_gbq_table(object_table, api_name="read_gbq_object_table")[ - "uri" - ].str.to_blob(connection) + s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 48268d925d..c08bb8d0dc 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -48,7 +48,6 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], api_methods: typing.List[str], - api_name: Optional[str] = None, ) -> Dict[str, str]: if job_configs_labels is None: job_configs_labels = {} @@ -58,9 +57,6 @@ def create_job_configs_labels( for key, value in bigframes.options.compute.extra_query_labels.items(): job_configs_labels[key] = value - if api_name is not None: - job_configs_labels["bigframes-api"] = api_name - if api_methods and "bigframes-api" not in job_configs_labels: job_configs_labels["bigframes-api"] = api_methods[0] del api_methods[0] @@ -202,7 +198,7 @@ def format_option(key: str, value: Union[bool, str]) -> str: return f"{key}={repr(value)}" -def add_and_trim_labels(job_config, api_name: Optional[str] = None): +def add_and_trim_labels(job_config): """ Add additional labels to the job configuration and trim the total number of labels to ensure they do not exceed the maximum limit allowed by BigQuery, which is 64 @@ -212,7 +208,6 @@ def add_and_trim_labels(job_config, api_name: Optional[str] = None): job_config.labels = create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods, - api_name=api_name, ) @@ -223,7 +218,6 @@ def start_query_with_client( location: Optional[str] = None, project: Optional[str] = None, timeout: Optional[float] = None, - api_name: Optional[str] = None, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, *, query_with_job: bool = True, @@ -234,7 +228,7 @@ def start_query_with_client( try: # Note: Ensure no additional labels are added to job_config after this point, # as `add_and_trim_labels` ensures the label count does not exceed 64. - add_and_trim_labels(job_config, api_name=api_name) + add_and_trim_labels(job_config) if not query_with_job: results_iterator = bq_client.query_and_wait( sql, @@ -308,7 +302,6 @@ def create_bq_dataset_reference( bq_client: bigquery.Client, location=None, project=None, - api_name: str = "unknown", ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. 
@@ -337,7 +330,6 @@ def create_bq_dataset_reference( location=location, job_config=job_config, project=project, - api_name=api_name, ) # The anonymous dataset is used by BigQuery to write query results and diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 6b1cb99c65..2dff16933f 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -165,7 +165,6 @@ def infer_unique_columns( bqclient: bigquery.Client, table: bigquery.table.Table, index_cols: List[str], - api_name: str, metadata_only: bool = False, ) -> Tuple[str, ...]: """Return a set of columns that can provide a unique row key or empty if none can be inferred. @@ -187,7 +186,6 @@ def infer_unique_columns( # table_expression only selects just index_cols. is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index c5808aa63c..c8980e159b 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -43,7 +43,6 @@ def __init__( self.dataset = bf_io_bigquery.create_bq_dataset_reference( self.bqclient, location=self._location, - api_name="session-__init__", ) self.session_id = session_id diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 9288fdf641..6614abfed2 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -263,7 +263,6 @@ def export_gcs( self.bqclient, export_data_statement, job_config=bigquery.QueryJobConfig(), - api_name=f"dataframe-to_{format.lower()}", metrics=self.metrics, ) return query_job @@ -313,7 +312,6 @@ def _run_execute_query( self, sql: str, job_config: Optional[bq_job.QueryJobConfig] = None, - api_name: Optional[str] = None, query_with_job: bool = True, ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]: """ @@ -333,7 +331,6 @@ def _run_execute_query( self.bqclient, sql, job_config=job_config, - api_name=api_name, metrics=self.metrics, query_with_job=query_with_job, ) @@ -477,7 +474,6 @@ def _sql_as_cached_temp_table( _, query_job = self._run_execute_query( sql, job_config=job_config, - api_name="cached", ) assert query_job is not None query_job.result() diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index b630dedb7b..1e32f3d860 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -172,7 +172,6 @@ def read_pandas( self, pandas_dataframe: pandas.DataFrame, method: Literal["load", "stream", "write"], - api_name: str, ) -> dataframe.DataFrame: # TODO: Push this into from_pandas, along with index flag from bigframes import dataframe @@ -186,7 +185,7 @@ def read_pandas( managed_data = local_data.ManagedArrowTable.from_pandas(prepared_df) if method == "load": - array_value = self.load_data(managed_data, api_name=api_name) + array_value = self.load_data(managed_data) elif method == "stream": array_value = self.stream_data(managed_data) elif method == "write": @@ -202,9 +201,7 @@ def read_pandas( ) return dataframe.DataFrame(block) - def load_data( - self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None - ) -> core.ArrayValue: + def load_data(self, data: local_data.ManagedArrowTable) 
-> core.ArrayValue: """Load managed data into bigquery""" ordering_col = guid.generate_guid("load_offsets_") @@ -227,8 +224,6 @@ def load_data( job_config.parquet_options = parquet_options job_config.schema = bq_schema - if api_name: - job_config.labels = {"bigframes-api": api_name} load_table_destination = self._storage_manager.create_temp_table( bq_schema, [ordering_col] @@ -368,7 +363,6 @@ def read_gbq_table( # type: ignore[overload-overlap] columns: Iterable[str] = ..., names: Optional[Iterable[str]] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: bool = ..., filters: third_party_pandas_gbq.FiltersType = ..., enable_snapshot: bool = ..., @@ -390,7 +384,6 @@ def read_gbq_table( columns: Iterable[str] = ..., names: Optional[Iterable[str]] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: bool = ..., filters: third_party_pandas_gbq.FiltersType = ..., enable_snapshot: bool = ..., @@ -411,7 +404,6 @@ def read_gbq_table( columns: Iterable[str] = (), names: Optional[Iterable[str]] = None, max_results: Optional[int] = None, - api_name: str = "read_gbq_table", use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), enable_snapshot: bool = True, @@ -543,7 +535,6 @@ def read_gbq_table( query, index_col=index_cols, columns=columns, - api_name=api_name, use_cache=use_cache, dry_run=dry_run, ) @@ -595,7 +586,6 @@ def read_gbq_table( bqclient=self._bqclient, table=table, index_cols=index_cols, - api_name=api_name, # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique metadata_only=not self._scan_index_uniqueness, ) @@ -718,7 +708,6 @@ def read_gbq_query( # type: ignore[overload-overlap] columns: Iterable[str] = ..., configuration: Optional[Dict] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: Optional[bool] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., @@ -735,7 +724,6 @@ def read_gbq_query( columns: Iterable[str] = ..., configuration: Optional[Dict] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: Optional[bool] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., @@ -751,7 +739,6 @@ def read_gbq_query( columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, - api_name: str = "read_gbq_query", use_cache: Optional[bool] = None, filters: third_party_pandas_gbq.FiltersType = (), dry_run: bool = False, @@ -817,7 +804,6 @@ def read_gbq_query( destination, query_job = self._query_to_destination( query, cluster_candidates=[], - api_name=api_name, configuration=configuration, ) @@ -845,7 +831,6 @@ def read_gbq_query( index_col=index_col, columns=columns, use_cache=configuration["query"]["useQueryCache"], - api_name=api_name, force_total_order=force_total_order, # max_results and filters are omitted because they are already # handled by to_query(), above. @@ -855,7 +840,6 @@ def _query_to_destination( self, query: str, cluster_candidates: List[str], - api_name: str, configuration: dict = {"query": {"useQueryCache": True}}, do_clustering=True, ) -> Tuple[Optional[bigquery.TableReference], bigquery.QueryJob]: @@ -863,11 +847,9 @@ def _query_to_destination( # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. 
dry_run_config = bigquery.QueryJobConfig() dry_run_config.dry_run = True - _, dry_run_job = self._start_query( - query, job_config=dry_run_config, api_name=api_name - ) + _, dry_run_job = self._start_query(query, job_config=dry_run_config) if dry_run_job.statement_type != "SELECT": - _, query_job = self._start_query(query, api_name=api_name) + _, query_job = self._start_query(query) return query_job.destination, query_job # Create a table to workaround BigQuery 10 GB query results limit. See: @@ -905,7 +887,6 @@ def _query_to_destination( query, job_config=job_config, timeout=timeout, - api_name=api_name, ) return query_job.destination, query_job except google.api_core.exceptions.BadRequest: @@ -913,7 +894,7 @@ def _query_to_destination( # tables as the destination. For example, if the query has a # top-level ORDER BY, this conflicts with our ability to cluster # the table by the index column(s). - _, query_job = self._start_query(query, timeout=timeout, api_name=api_name) + _, query_job = self._start_query(query, timeout=timeout) return query_job.destination, query_job def _start_query( @@ -921,7 +902,6 @@ def _start_query( sql: str, job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, timeout: Optional[float] = None, - api_name: Optional[str] = None, ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: """ Starts BigQuery query job and waits for results. @@ -939,7 +919,6 @@ def _start_query( sql, job_config=job_config, timeout=timeout, - api_name=api_name, ) assert query_job is not None return iterator, query_job diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 811c64a27b..eba015dd9d 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -45,6 +45,10 @@ def method2(self): def method3(self): pass + @log_adapter.log_name_override("override_name") + def method4(self): + pass + @property def my_field(self): return 0 @@ -52,55 +56,57 @@ def my_field(self): return TestClass() -def test_method_logging(test_instance): +@pytest.fixture +def test_method(): + @log_adapter.method_logger + def method1(): + pass + + return method1 + + +@pytest.fixture +def test_method_w_custom_base(): + def method1(): + pass + + _decorated_method = log_adapter.method_logger(method1, custom_base_name="pandas") + + return _decorated_method + + +def test_class_attribute_logging(test_instance): test_instance.method1() test_instance.method2() + test_instance.method4() # Check if the methods were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() assert "testclass-method1" in api_methods assert "testclass-method2" in api_methods assert "testclass-method3" not in api_methods + assert "testclass-method4" not in api_methods + assert "testclass-override_name" in api_methods -def test_property_logging(test_instance): - test_instance.my_field - - # Check if the properties were added to the _api_methods list +def test_method_logging(test_method): + test_method() api_methods = log_adapter.get_and_reset_api_methods() - assert "testclass-my_field" in api_methods + assert "locals-method1" in api_methods -def test_method_logging__include_internal_calls(): - @log_adapter.class_logger(include_internal_calls=True) - class TestClass: - def public_method(self): - self._internal_method() - - def _internal_method(self): - pass - - TestClass().public_method() - +def test_method_logging_with_custom_base_name(test_method_w_custom_base): + test_method_w_custom_base() api_methods = 
log_adapter.get_and_reset_api_methods() - assert "testclass-public_method" in api_methods - assert "testclass-_internal_method" in api_methods + assert "pandas-method1" in api_methods -def test_method_logging__exclude_internal_calls(): - @log_adapter.class_logger(include_internal_calls=False) - class TestClass: - def public_method(self): - self._internal_method() - - def _internal_method(self): - pass - - TestClass().public_method() +def test_property_logging(test_instance): + test_instance.my_field + # Check if the properties were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() - assert "testclass-public_method" in api_methods - assert "testclass-_internal_method" not in api_methods + assert "testclass-my_field" in api_methods def test_add_api_method_limit(test_instance): @@ -176,6 +182,20 @@ def test_get_and_reset_api_methods(test_instance): "args_count": 0, }, ), + ( + "pandas", + "concat", + [[None, None]], + {"axis": 1}, + log_adapter.PANDAS_API_TRACKING_TASK, + { + "task": log_adapter.PANDAS_API_TRACKING_TASK, + "class_name": "pandas", + "method_name": "concat", + "args_count": 1, + "kwargs_0": "axis", + }, + ), ), ) def test_submit_pandas_labels( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 14e5d1c2fe..e5e2c58d59 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -231,7 +231,6 @@ def test_start_query_with_client_labels_length_limit_met( sql, job_config, timeout=timeout, - api_name=api_name, ) assert job_config.labels is not None diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index a27abd5f6c..9afdba9eb3 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -29,7 +29,7 @@ def test_read_gbq_colab_includes_label(): continue label_values.extend(config.labels.values()) - assert "read_gbq_colab" in label_values + assert "session-read_gbq_colab" in label_values def test_read_gbq_colab_includes_formatted_values_in_dry_run(): diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index a56b4ed7ab..6a4ae7cb60 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -91,6 +91,6 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte ) table._properties["location"] = session._location - result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols, "") + result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) assert result == expected From 8ef4de10151717f88364a909b29fa7600e959ada Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 12 May 2025 16:21:05 -0700 Subject: [PATCH 03/52] feat: support astype conversions to and from JSON dtypes (#1716) --- bigframes/core/compile/scalar_op_compiler.py | 74 +++++++++++ tests/system/small/test_series.py | 123 +++++++++++++++++- .../ibis/backends/sql/compilers/base.py | 6 + 3 files changed, 199 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 78e373121e..a1cf72be97 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1164,6 +1164,35 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): elif to_type == ibis_dtypes.time: return x_converted.time() + if to_type == ibis_dtypes.json: + if x.type() == 
ibis_dtypes.string: + return parse_json_in_safe(x) if op.safe else parse_json(x) + if x.type() == ibis_dtypes.bool: + x_bool = typing.cast( + ibis_types.StringValue, + bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ), + ).lower() + return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) + if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): + x_str = bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ) + return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) + + if x.type() == ibis_dtypes.json: + if to_type == ibis_dtypes.int64: + return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) + if to_type == ibis_dtypes.float64: + return ( + cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) + ) + if to_type == ibis_dtypes.bool: + return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) + if to_type == ibis_dtypes.string: + return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @@ -2047,6 +2076,11 @@ def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] """Converts a JSON-formatted STRING value to a JSON value.""" +@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON") +def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] + """Converts a JSON-formatted STRING value to a JSON value in the safe mode.""" + + @ibis_udf.scalar.builtin(name="json_set") def json_set( # type: ignore[empty-body] json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value @@ -2075,6 +2109,46 @@ def json_value( # type: ignore[empty-body] """Retrieve value of a JSON field as plain STRING.""" +@ibis_udf.scalar.builtin(name="INT64") +def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.INT64") +def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value in the safe mode.""" + + +@ibis_udf.scalar.builtin(name="FLOAT64") +def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.FLOAT64") +def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="BOOL") +def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.BOOL") +def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="STRING") +def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.STRING") +def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: 
ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + @ibis_udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body] """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 99526a65d2..7972fbe1e9 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -13,12 +13,14 @@ # limitations under the License. import datetime as dt +import json import math import re import tempfile import db_dtypes # type: ignore import geopandas as gpd # type: ignore +import google.api_core.exceptions import numpy from packaging.version import Version import pandas as pd @@ -3474,9 +3476,11 @@ def foo(x): ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), ("int64_col", "time64[us][pyarrow]"), + ("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), ("bool_col", "Float64"), + ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("string_col", "binary[pyarrow]"), ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and @@ -3541,7 +3545,7 @@ def test_astype_safe(session): pd.testing.assert_series_equal(result, exepcted) -def test_series_astype_error_error(session): +def test_series_astype_w_invalid_error(session): input = pd.Series(["hello", "world", "3.11", "4000"]) with pytest.raises(ValueError): session.read_pandas(input).astype("Float64", errors="bad_value") @@ -3676,6 +3680,119 @@ def test_timestamp_astype_string(): assert bf_result.dtype == "string[pyarrow]" +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_float_astype_json(errors): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_string_astype_json(errors): + data = [ + "1", + None, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + ] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_string_astype_json_in_safe_mode(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +def test_string_astype_json_raise_error(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="syntax error while parsing value", + ): + 
bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), + pytest.param( + ['"str"', None], + dtypes.TIME_DTYPE, + id="invalid", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_astype_others(data, to_type, errors): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + + bf_result = bf_series.astype(to_type, errors=errors) + assert bf_result.dtype == to_type + + load_data = [json.loads(item) if item is not None else None for item in data] + expected = pd.Series(load_data, dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_raise_error(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + with pytest.raises(google.api_core.exceptions.BadRequest): + bf_series.astype(to_type, errors="raise").to_pandas() + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_in_safe_mode(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + bf_result = bf_series.astype(to_type, errors="null") + assert bf_result.dtype == to_type + + expected = pd.Series([None, None], dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + @pytest.mark.parametrize( "index", [0, 5, -2], @@ -3687,9 +3804,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): assert bf_result == pd_result -def test_iloc_single_integer_out_of_bound_error( - scalars_df_index, scalars_pandas_df_index -): +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): scalars_df_index.string_col.iloc[99] diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index 6e98d6a9e1..acccd7ea6c 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -1222,6 +1222,12 @@ def __sql_name__(self, op: ops.ScalarUDF | ops.AggUDF) -> str: # not actually a table, but easier to quote individual namespace # components this way namespace = op.__udf_namespace__ + + # Function names prefixed with "SAFE.", such as `SAFE.PARSE_JSON`, + # are typically not quoted. 
+ if funcname.startswith("SAFE."): + return funcname + return sg.table(funcname, db=namespace.database, catalog=namespace.catalog).sql( self.dialect ) From df24c842b7de9cca10e9d676405b258facf31225 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 12 May 2025 16:57:31 -0700 Subject: [PATCH 04/52] chore: add code samples for Data Manipulation public doc (#1722) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: add code samples for Data Manipulation public doc * fix sample format * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- samples/snippets/bigquery_modules_test.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py index 1a15790815..0cc2b1d8b5 100644 --- a/samples/snippets/bigquery_modules_test.py +++ b/samples/snippets/bigquery_modules_test.py @@ -14,6 +14,19 @@ def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_array_agg] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + s = bpd.Series([0, 1, 2, 3, 4, 5]) + + # Group values by whether they are divisble by 2 and aggregate them into arrays + bbq.array_agg(s.groupby(s % 2 == 0)) + # False [1 3 5] + # True [0 2 4] + # dtype: list[pyarrow] + # [END bigquery_dataframes_bigquery_methods_array_agg] + # [START bigquery_dataframes_bigquery_methods_struct] import bigframes.bigquery as bbq import bigframes.pandas as bpd @@ -36,6 +49,22 @@ def test_bigquery_dataframes_examples() -> None: # dtype: struct[pyarrow] # [END bigquery_dataframes_bigquery_methods_struct] + # [START bigquery_dataframes_bigquery_methods_unix_micros] + import pandas as pd + + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Create a series that consists of three timestamps: [1970-01-01, 1970-01-02, 1970-01-03] + s = bpd.Series(pd.date_range("1970-01-01", periods=3, freq="d", tz="UTC")) + + bbq.unix_micros(s) + # 0 0 + # 1 86400000000 + # 2 172800000000 + # dtype: Int64 + # [END bigquery_dataframes_bigquery_methods_unix_micros] + # [START bigquery_dataframes_bigquery_methods_scalar] import bigframes.bigquery as bbq import bigframes.pandas as bpd From 55c07e9d4315949c37ffa3e03c8fedc6daf17faf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 13 May 2025 14:59:15 -0500 Subject: [PATCH 05/52] fix: stop ignoring arguments to `MatrixFactorization.score(X, y)` (#1726) * fix: stop ignoring arguments to `MatrixFactorization.score(X, y)` * fix unit tests --- bigframes/ml/decomposition.py | 11 ++++++++-- tests/system/large/ml/test_decomposition.py | 12 +++++++++- tests/unit/ml/test_golden_sql.py | 22 ++++++++++++++++--- .../sklearn/decomposition/_mf.py | 10 +++++---- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ece950a5a2..3ff32d2433 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -360,5 +360,12 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE. 
- return self._bqml_model.evaluate() + if X is not None and y is not None: + X, y = utils.batch_convert_to_dataframe( + X, y, session=self._bqml_model.session + ) + input_data = X.join(y, how="outer") + else: + input_data = X + + return self._bqml_model.evaluate(input_data) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index d1a5f9f2aa..e0e4b79c6f 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -13,6 +13,7 @@ # limitations under the License. import pandas as pd +import pandas.testing from bigframes.ml import decomposition from tests.system import utils @@ -193,7 +194,16 @@ def test_decomposition_mf_configure_fit_load( ) ) - reloaded_model.score(new_ratings) + # Make sure the input to score is not ignored. + scores_training_data = reloaded_model.score().to_pandas() + scores_new_ratings = reloaded_model.score(new_ratings).to_pandas() + pandas.testing.assert_index_equal( + scores_training_data.columns, scores_new_ratings.columns + ) + assert ( + scores_training_data["mean_squared_error"].iloc[0] + != scores_new_ratings["mean_squared_error"].iloc[0] + ) result = reloaded_model.predict(new_ratings).to_pandas() diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 62cfe09704..10fefcc457 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -81,6 +81,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + type(mock_X).sql = mock.PropertyMock(return_value="input_X_sql_property") mock_X.reset_index(drop=True).cache().sql = "input_X_no_index_sql" mock_X.join(mock_y).sql = "input_X_y_sql" mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) @@ -248,7 +249,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): ) -def test_decomposition_mf_score(mock_session, bqml_model, mock_X): +def test_decomposition_mf_score(mock_session, bqml_model): model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", @@ -258,8 +259,23 @@ def test_decomposition_mf_score(mock_session, bqml_model, mock_X): l2_reg=9.83, ) model._bqml_model = bqml_model - model.score(mock_X) - + model.score() mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) + + +def test_decomposition_mf_score_with_x(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.score(mock_X) + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql_property))" + ) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index fb29cc8984..c3c3a77b71 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -73,11 +73,13 @@ def score(self, X=None, y=None): for the outputs relevant to this model type. Args: - X (default None): - Ignored. + X (bigframes.dataframe.DataFrame | bigframes.series.Series | None): + DataFrame of shape (n_samples, n_features). Test samples. 
+ + y (bigframes.dataframe.DataFrame | bigframes.series.Series | None): + DataFrame of shape (n_samples,) or (n_samples, n_outputs). True + labels for `X`. - y (default None): - Ignored. Returns: bigframes.dataframe.DataFrame: DataFrame that represents model metrics. """ From de9efdb62205f486162ebfb80a6cd7b7d2d5c653 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 13 May 2025 17:19:43 -0700 Subject: [PATCH 06/52] test: add gemini-2.0-flash-lite-011 test (#1729) --- bigframes/ml/llm.py | 5 +++-- tests/system/load/test_llm.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ef74c4ac55..591d18e3b5 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -529,7 +529,8 @@ def fit( y: utils.ArrayType, ) -> GeminiTextGenerator: """Fine tune GeminiTextGenerator model. Only support "gemini-1.5-pro-002", - and "gemini-1.5-flash-002" models for now. + "gemini-1.5-flash-002", "gemini-2.0-flash-001", + and "gemini-2.0-flash-lite-001"models for now. .. note:: @@ -549,7 +550,7 @@ def fit( """ if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS: msg = exceptions.format_message( - "fit() only supports gemini-1.5-pro-002, or gemini-1.5-flash-002 model." + "fit() only supports gemini-1.5-pro-002, gemini-1.5-flash-002, gemini-2.0-flash-001, or gemini-2.0-flash-lite-001 model." ) warnings.warn(msg) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index d346d109be..49f79d9d44 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -44,6 +44,7 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): "gemini-1.5-pro-002", "gemini-1.5-flash-002", "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) def test_llm_gemini_configure_fit( From 6c5337813c1552af5c8a39feedfd074202052df7 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 13 May 2025 18:09:03 -0700 Subject: [PATCH 07/52] chore: disable semantic sys tests and doc tests (#1730) --- bigframes/operations/semantics.py | 28 +++++++++---------- .../system/large/operations/test_semantics.py | 5 ++++ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 8c5c54e8ca..60d619992a 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -57,7 +57,7 @@ def agg( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame( ... { @@ -68,7 +68,7 @@ def agg( ... ], ... "Year": [1997, 2013, 2010], ... }) - >>> df.semantics.agg( + >>> df.semantics.agg( # doctest: +SKIP ... "Find the first name shared by all actors in {Movies}. One word answer.", ... model=model, ... 
) @@ -326,10 +326,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) - >>> df.semantics.filter("{city} is the capital of {country}", model) + >>> df.semantics.filter("{city} is the capital of {country}", model) # doctest: +SKIP country city 1 Germany Berlin @@ -440,10 +440,10 @@ def map( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) - >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) # doctest: +SKIP ingredient_1 ingredient_2 food 0 Burger Bun Beef Patty Burger @@ -563,12 +563,12 @@ def join( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) - >>> cities.semantics.join(continents, "{city} is in {continent}", model) + >>> cities.semantics.join(continents, "{city} is in {continent}", model) # doctest: +SKIP city continent 0 Seattle North America 1 Ottawa North America @@ -704,10 +704,10 @@ def search( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") # doctest: +SKIP >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) - >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') + >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') # doctest: +SKIP creatures distance 3 chimpanzee 0.635844 @@ -805,14 +805,14 @@ def top_k( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame( ... { ... "Animals": ["Dog", "Bird", "Cat", "Horse"], ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], ... 
}) - >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) + >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) # doctest: +SKIP Animals Sounds 0 Dog Woof 2 Cat Meow @@ -1006,12 +1006,12 @@ def sim_join( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") # doctest: +SKIP >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) - >>> df1.semantics.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + >>> df1.semantics.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) # doctest: +SKIP animal animal_1 0 monkey baboon 1 spider scorpion diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index c3f08c6204..3517b1adbc 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -22,6 +22,11 @@ import bigframes from bigframes import dataframe, dtypes, exceptions, series +pytest.skip( + "Semantics namespace is deprecated. ", + allow_module_level=True, +) + SEM_OP_EXP_OPTION = "experiments.semantic_operators" BLOB_EXP_OPTION = "experiments.blob" THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" From b2261cc07cd58b51d212f9bf495c5022e587f816 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 13 May 2025 18:51:27 -0700 Subject: [PATCH 08/52] docs: add llm output_schema notebook (#1732) * docs: add llm output_schema notebook * add --- .../bq_dataframes_llm_output_schema.ipynb | 770 ++++++++++++++++++ .../multimodal/multimodal_dataframe.ipynb | 2 +- 2 files changed, 771 insertions(+), 1 deletion(-) create mode 100644 notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb new file mode 100644 index 0000000000..0efac1eee3 --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -0,0 +1,770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames LLM Output Schema\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This Notebook introduces BigFrames LLM with output schema to generate structured output dataframes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd\n", + "from bigframes.ml import llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Create a BigFrames DataFrame and a Gemini model\n", + "Starting from creating a simple dataframe of several cities and a Gemini model in BigFrames" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city
0Seattle
1New York
2Shanghai
\n", + "

3 rows × 1 columns

\n", + "
[3 rows x 1 columns in total]" + ], + "text/plain": [ + " city\n", + "0 Seattle\n", + "1 New York\n", + "2 Shanghai\n", + "\n", + "[3 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"city\": [\"Seattle\", \"New York\", \"Shanghai\"]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n" + ] + } + ], + "source": [ + "gemini = llm.GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Generate structured output data\n", + "Before, llm models can only generate text output. Saying if you want to know whether the city is a US city, for example:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityml_generate_text_llm_result
0SeattleYes, Seattle is a city in the United States. I...
1New YorkYes, New York City is a city in the United Sta...
2ShanghaiNo, Shanghai is not a US city. It is a major c...
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city ml_generate_text_llm_result\n", + "0 Seattle Yes, Seattle is a city in the United States. I...\n", + "1 New York Yes, New York City is a city in the United Sta...\n", + "2 Shanghai No, Shanghai is not a US city. It is a major c...\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"])\n", + "result[[\"city\", \"ml_generate_text_llm_result\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The outputs are text results that human can read. But if want the output data to be more useful for analysis, it is better to transfer to structured data like boolean, int or float values. Usually the process wasn't easy.\n", + "\n", + "Now you can get structured output out-of-the-box by specifying the output_schema parameter in Gemini model predict method. In below example, the outputs are only boolean values." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_us_city
0SeattleTrue
1New YorkTrue
2ShanghaiFalse
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city is_us_city\n", + "0 Seattle True\n", + "1 New York True\n", + "2 Shanghai False\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"], output_schema={\"is_us_city\": \"bool\"})\n", + "result[[\"city\", \"is_us_city\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get float or int values, for example, to get polulations in millions:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
citypolulation_million
0Seattle0.75
1New York19.68
2Shanghai26.32
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city polulation_million\n", + "0 Seattle 0.75\n", + "1 New York 19.68\n", + "2 Shanghai 26.32\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"what is the population in millions of\", df[\"city\"]], output_schema={\"polulation_million\": \"float64\"})\n", + "result[[\"city\", \"polulation_million\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And yearly rainy days:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityrainy_days
0Seattle152
1New York123
2Shanghai123
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city rainy_days\n", + "0 Seattle 152\n", + "1 New York 123\n", + "2 Shanghai 123\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"how many rainy days per year in\", df[\"city\"]], output_schema={\"rainy_days\": \"int64\"})\n", + "result[[\"city\", \"rainy_days\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Generate all types of data in one prediction\n", + "You can get the different output columns and types in one prediction. \n", + "\n", + "Note it doesn't require dedicated prompts, as long as the output column names are informative to the model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypolulation_in_millionsrainy_days_per_year
0SeattleTrue0.75152
1New YorkTrue8.8121
2ShanghaiFalse26.32115
\n", + "

3 rows × 4 columns

\n", + "
[3 rows x 4 columns in total]" + ], + "text/plain": [ + " city is_US_city polulation_in_millions rainy_days_per_year\n", + "0 Seattle True 0.75 152\n", + "1 New York True 8.8 121\n", + "2 Shanghai False 26.32 115\n", + "\n", + "[3 rows x 4 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\"})\n", + "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Generate composite data types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Composite datatypes like array and struct can also be generated. Here the example generates a places_to_visit column as array of strings and a gps_coordinates as struct of floats. Along with previous fields, all in one prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypolulation_in_millionsrainy_days_per_yearplaces_to_visitgps_coordinates
0SeattleTrue0.74150['Space Needle' 'Pike Place Market' 'Museum of...{'latitude': 47.6062, 'longitude': -122.3321}
1New YorkTrue8.4121['Times Square' 'Central Park' 'Statue of Libe...{'latitude': 40.7128, 'longitude': -74.006}
2ShanghaiFalse26.32115['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori...{'latitude': 31.2304, 'longitude': 121.4737}
\n", + "

3 rows × 6 columns

\n", + "
[3 rows x 6 columns in total]" + ], + "text/plain": [ + " city is_US_city polulation_in_millions rainy_days_per_year \\\n", + "0 Seattle True 0.74 150 \n", + "1 New York True 8.4 121 \n", + "2 Shanghai False 26.32 115 \n", + "\n", + " places_to_visit \\\n", + "0 ['Space Needle' 'Pike Place Market' 'Museum of... \n", + "1 ['Times Square' 'Central Park' 'Statue of Libe... \n", + "2 ['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori... \n", + "\n", + " gps_coordinates \n", + "0 {'latitude': 47.6062, 'longitude': -122.3321} \n", + "1 {'latitude': 40.7128, 'longitude': -74.006} \n", + "2 {'latitude': 31.2304, 'longitude': 121.4737} \n", + "\n", + "[3 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\", \"places_to_visit\": \"array\", \"gps_coordinates\": \"struct\"})\n", + "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\", \"places_to_visit\", \"gps_coordinates\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index ce3f10b881..b7d713c342 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -55,7 +55,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook is introducing BigFrames experimental Multimodal features:\n", + "This notebook is introducing BigFrames Multimodal features:\n", "1. Create Multimodal DataFrame\n", "2. Combine unstructured data with structured data\n", "3. Conduct image transformations\n", From 476b7dd7c2639cb6804272d06aa5c1db666819da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 14 May 2025 12:31:29 -0500 Subject: [PATCH 09/52] docs: use partial ordering mode in the quickstart sample (#1734) --- samples/snippets/quickstart.py | 19 ++++++++++--------- samples/snippets/quickstart_test.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index c26c6f4442..adc85fa92d 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -14,16 +14,7 @@ def run_quickstart(project_id: str) -> None: - import bigframes - - session_options = bigframes.BigQueryOptions() - session = bigframes.connect(session_options) - your_gcp_project_id = project_id - query_or_table = "bigquery-public-data.ml_datasets.penguins" - df_session = session.read_gbq(query_or_table) - average_body_mass = df_session["body_mass_g"].mean() - print(f"average_body_mass (df_session): {average_body_mass}") # [START bigquery_bigframes_quickstart] import bigframes.pandas as bpd @@ -33,10 +24,20 @@ def run_quickstart(project_id: str) -> None: # On BigQuery Studio, the project ID is automatically detected. 
bpd.options.bigquery.project = your_gcp_project_id + # Use "partial" ordering mode to generate more efficient queries, but the + # order of the rows in DataFrames may not be deterministic if you have not + # explictly sorted it. Some operations that depend on the order, such as + # head() will not function until you explictly order the DataFrame. Set the + # ordering mode to "strict" (default) for more pandas compatibility. + bpd.options.bigquery.ordering_mode = "partial" + # Create a DataFrame from a BigQuery table query_or_table = "bigquery-public-data.ml_datasets.penguins" df = bpd.read_gbq(query_or_table) + # Efficiently preview the results using the .peek() method. + df.peek() + # Use the DataFrame just as you would a pandas DataFrame, but calculations # happen in the BigQuery query engine instead of the local system. average_body_mass = df["body_mass_g"].mean() diff --git a/samples/snippets/quickstart_test.py b/samples/snippets/quickstart_test.py index 4abc87d011..a650f8365d 100644 --- a/samples/snippets/quickstart_test.py +++ b/samples/snippets/quickstart_test.py @@ -33,4 +33,4 @@ def test_quickstart( quickstart.run_quickstart(your_project_id) out, _ = capsys.readouterr() - assert "average_body_mass (df_session):" in out + assert "average_body_mass:" in out From 108f4d259e1bcfbe6c7aa3c3c3f8f605cf7615ee Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 15 May 2025 13:27:23 -0700 Subject: [PATCH 10/52] feat: Use read api for some peek ops (#1731) --- bigframes/session/read_api_execution.py | 32 ++++++++++++++++++++----- tests/system/large/test_dataframe_io.py | 14 +++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 46d55eb303..9384a40fbe 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -18,7 +18,7 @@ from google.cloud import bigquery_storage_v1 import pyarrow as pa -from bigframes.core import bigframe_node, rewrite +from bigframes.core import bigframe_node, nodes, pyarrow_utils, rewrite from bigframes.session import executor, semi_executor @@ -39,14 +39,11 @@ def execute( ordered: bool, peek: Optional[int] = None, ) -> Optional[executor.ExecuteResult]: - node = rewrite.try_reduce_to_table_scan(plan) + node = self._try_adapt_plan(plan, ordered) if not node: return None if node.explicitly_ordered and ordered: return None - if peek: - # TODO: Support peeking - return None import google.cloud.bigquery_storage_v1.types as bq_storage_types from google.protobuf import timestamp_pb2 @@ -92,16 +89,39 @@ def execute( def process_page(page): pa_batch = page.to_arrow() + pa_batch = pa_batch.select( + [item.source_id for item in node.scan_list.items] + ) return pa.RecordBatch.from_arrays( pa_batch.columns, names=[id.sql for id in node.ids] ) batches = map(process_page, rowstream.pages) + if peek: + batches = pyarrow_utils.truncate_pyarrow_iterable(batches, max_results=peek) + + rows = node.source.n_rows + if peek and rows: + rows = min(peek, rows) + return executor.ExecuteResult( arrow_batches=batches, schema=plan.schema, query_job=None, total_bytes=None, - total_rows=node.source.n_rows, + total_rows=rows, ) + + def _try_adapt_plan( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + ) -> Optional[nodes.ReadTableNode]: + """ + Tries to simplify the plan to an equivalent single ReadTableNode. Otherwise, returns None. 
+ """ + if not ordered: + # gets rid of order_by ops + plan = rewrite.bake_order(plan) + return rewrite.try_reduce_to_table_scan(plan) diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py index b10e361129..87d2acd34b 100644 --- a/tests/system/large/test_dataframe_io.py +++ b/tests/system/large/test_dataframe_io.py @@ -29,6 +29,20 @@ def test_to_pandas_batches_raise_when_large_result_not_allowed(session): next(df.to_pandas_batches(page_size=500, max_results=1500)) +def test_large_df_peek_no_job(session): + execution_count_before = session._metrics.execution_count + + # only works with null index, as sequential index requires row_number over full table scan. + df = session.read_gbq( + WIKIPEDIA_TABLE, index_col=bigframes.enums.DefaultIndexKind.NULL + ) + result = df.peek(50) + execution_count_after = session._metrics.execution_count + + assert len(result) == 50 + assert execution_count_after == execution_count_before + + def test_to_pandas_batches_override_global_option( session, ): From f8d2cd24281415f4a8f9193b676f5483128cd173 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 15 May 2025 14:23:40 -0700 Subject: [PATCH 11/52] fix: Support str.replace re.compile with flags (#1736) --- bigframes/operations/strings.py | 28 +++++++++++++------ tests/system/small/operations/test_strings.py | 1 + 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index a8430b0b0e..9022a1665e 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -15,7 +15,7 @@ from __future__ import annotations import re -from typing import cast, Literal, Optional, Union +from typing import Literal, Optional, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr @@ -230,21 +230,26 @@ def replace( flags: int = 0, regex: bool = False, ) -> series.Series: - is_compiled = isinstance(pat, re.Pattern) - patstr = cast(str, pat.pattern if is_compiled else pat) # type: ignore + if isinstance(pat, re.Pattern): + assert isinstance(pat.pattern, str) + pat_str = pat.pattern + flags = pat.flags | flags + else: + pat_str = pat + if case is False: - return self.replace(pat, repl, flags=flags | re.IGNORECASE, regex=True) + return self.replace(pat_str, repl, flags=flags | re.IGNORECASE, regex=True) if regex: re2flags = _parse_flags(flags) if re2flags: - patstr = re2flags + patstr - return self._apply_unary_op(ops.RegexReplaceStrOp(pat=patstr, repl=repl)) + pat_str = re2flags + pat_str + return self._apply_unary_op(ops.RegexReplaceStrOp(pat=pat_str, repl=repl)) else: - if is_compiled: + if isinstance(pat, re.Pattern): raise ValueError( "Must set 'regex'=True if using compiled regex pattern." 
) - return self._apply_unary_op(ops.ReplaceStrOp(pat=patstr, repl=repl)) + return self._apply_unary_op(ops.ReplaceStrOp(pat=pat_str, repl=repl)) def startswith( self, @@ -318,10 +323,15 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series: def _parse_flags(flags: int) -> Optional[str]: re2flags = [] for reflag, re2flag in REGEXP_FLAGS.items(): - if flags & flags: + if flags & reflag: re2flags.append(re2flag) flags = flags ^ reflag + # re2 handles unicode fine by default + # most compiled re in python will have unicode set + if re.U and flags: + flags = flags ^ re.U + # Remaining flags couldn't be mapped to re2 engine if flags: raise NotImplementedError( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index e4824875b4..032d93c19d 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -98,6 +98,7 @@ def test_str_extract(scalars_dfs, pat): (re.compile("(?i).e.."), "blah", None, 0, True), ("H", "h", True, 0, False), (", ", "__", True, 0, False), + (re.compile(r"hEllo", flags=re.I), "blah", None, 0, True), ], ) def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): From 7d93b35088d9305d0589e4154ed39c7dd44a8f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 15 May 2025 17:29:57 -0500 Subject: [PATCH 12/52] chore: enable bigframes objects in _read_gbq_colab pyformat arg (#1727) * chore: enable bigframes objects in _read_gbq_colab pyformat arg * use views for bigframes dataframe * move temp view logic to session * revert changes to block * revert more unnecessary view changes --- bigframes/core/blocks.py | 12 ++++++++ bigframes/core/pyformat.py | 21 ++++++++++---- bigframes/dataframe.py | 17 +++++++++-- bigframes/session/__init__.py | 5 +++- bigframes/session/_io/bigquery/__init__.py | 22 +++++++++++++++ bigframes/session/anonymous_dataset.py | 24 ++++++++++++++-- .../small/session/test_read_gbq_colab.py | 28 +++++++++++++++++++ tests/unit/session/test_read_gbq_colab.py | 28 ++++++++++++++----- 8 files changed, 137 insertions(+), 20 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ccb2ffe401..a3a2ac36f5 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -153,6 +153,7 @@ def __init__( self._stats_cache[" ".join(self.index_columns)] = {} self._transpose_cache: Optional[Block] = transpose_cache + self._view_ref: Optional[bigquery.TableReference] = None @classmethod def from_local( @@ -2487,6 +2488,17 @@ def to_sql_query( idx_labels, ) + def to_view(self, include_index: bool) -> bigquery.TableReference: + """ + Creates a temporary BigQuery VIEW with the SQL corresponding to this block. 
+ """ + if self._view_ref is not None: + return self._view_ref + + sql, _, _ = self.to_sql_query(include_index=include_index) + self._view_ref = self.session._create_temp_view(sql) + return self._view_ref + def cached(self, *, force: bool = False, session_aware: bool = False) -> None: """Write the block to a session table.""" # use a heuristic for whether something needs to be cached diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 98f175d300..59ccdf1f5f 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -37,9 +37,13 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" -def _field_to_template_value(name: str, value: Any) -> str: +def _field_to_template_value( + name: str, + value: Any, +) -> str: """Convert value to something embeddable in a SQL string.""" import bigframes.core.sql # Avoid circular imports + import bigframes.dataframe # Avoid circular imports _validate_type(name, value) @@ -47,20 +51,27 @@ def _field_to_template_value(name: str, value: Any) -> str: if isinstance(value, table_types): return _table_to_sql(value) - # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery. + # TODO(tswast): convert pandas DataFrame objects to gbq tables or a literals subquery. + if isinstance(value, bigframes.dataframe.DataFrame): + return _table_to_sql(value._to_view()) + return bigframes.core.sql.simple_literal(value) def _validate_type(name: str, value: Any): """Raises TypeError if value is unsupported.""" import bigframes.core.sql # Avoid circular imports + import bigframes.dataframe # Avoid circular imports if value is None: return # None can't be used in isinstance, but is a valid literal. - supported_types = typing.get_args(_BQ_TABLE_TYPES) + typing.get_args( - bigframes.core.sql.SIMPLE_LITERAL_TYPES + supported_types = ( + typing.get_args(_BQ_TABLE_TYPES) + + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + + (bigframes.dataframe.DataFrame,) ) + if not isinstance(value, supported_types): raise TypeError( f"{name} has unsupported type: {type(value)}. " @@ -80,8 +91,6 @@ def pyformat( sql_template: str, *, pyformat_args: dict, - # TODO: add dry_run parameter to avoid expensive API calls in conversion - # TODO: and session to upload data / convert to table if necessary ) -> str: """Unsafe Python-style string formatting of SQL string. diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8ed749138c..a98733b48a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -394,6 +394,19 @@ def astype( return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) + def _should_sql_have_index(self) -> bool: + """Should the SQL we pass to BQML and other I/O include the index?""" + + return self._has_index and ( + self.index.name is not None or len(self.index.names) > 1 + ) + + def _to_view(self) -> bigquery.TableReference: + """Compiles this DataFrame's expression tree to SQL and saves it to a + (temporary) view. + """ + return self._block.to_view(include_index=self._should_sql_have_index()) + def _to_sql_query( self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[blocks.Label]]: @@ -420,9 +433,7 @@ def sql(self) -> str: string representing the compiled SQL. 
""" try: - include_index = self._has_index and ( - self.index.name is not None or len(self.index.names) > 1 - ) + include_index = self._should_sql_have_index() sql, _, _ = self._to_sql_query(include_index=include_index) return sql except AttributeError as e: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 81359ebb36..7630e71eaa 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -527,7 +527,6 @@ def _read_gbq_colab( query = bigframes.core.pyformat.pyformat( query, pyformat_args=pyformat_args, - # TODO: add dry_run parameter to avoid API calls for data in pyformat_args ) return self._loader.read_gbq_query( @@ -1938,6 +1937,10 @@ def _create_object_table(self, path: str, connection: str) -> str: return table + def _create_temp_view(self, sql: str) -> bigquery.TableReference: + """Create a random id Object Table from the input path and connection.""" + return self._anon_dataset_manager.create_temp_view(sql) + def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index c08bb8d0dc..267111afe0 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -139,6 +139,28 @@ def create_temp_table( return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" +def create_temp_view( + bqclient: bigquery.Client, + table_ref: bigquery.TableReference, + *, + expiration: datetime.datetime, + sql: str, +) -> str: + """Create an empty table with an expiration in the desired session. + + The table will be deleted when the session is closed or the expiration + is reached. + """ + destination = bigquery.Table(table_ref) + destination.expires = expiration + destination.view_query = sql + + # Ok if already exists, since this will only happen from retries internal to this method + # as the requested table id has a random UUID4 component. + bqclient.create_table(destination, exists_ok=True) + return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" + + def set_table_expiration( bqclient: bigquery.Client, table_ref: bigquery.TableReference, diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index c8980e159b..bc785f693f 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -53,6 +53,12 @@ def __init__( def location(self): return self._location + def _default_expiration(self): + """When should the table expire automatically?""" + return ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + def create_temp_table( self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] ) -> bigquery.TableReference: @@ -60,9 +66,7 @@ def create_temp_table( Allocates and and creates a table in the anonymous dataset. The table will be cleaned up by clean_up_tables. """ - expiration = ( - datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION - ) + expiration = self._default_expiration() table = bf_io_bigquery.create_temp_table( self.bqclient, self.allocate_temp_table(), @@ -73,6 +77,20 @@ def create_temp_table( ) return bigquery.TableReference.from_string(table) + def create_temp_view(self, sql: str) -> bigquery.TableReference: + """ + Allocates and and creates a view in the anonymous dataset. + The view will be cleaned up by clean_up_tables. 
+ """ + expiration = self._default_expiration() + table = bf_io_bigquery.create_temp_view( + self.bqclient, + self.allocate_temp_table(), + expiration=expiration, + sql=sql, + ) + return bigquery.TableReference.from_string(table) + def allocate_temp_table(self) -> bigquery.TableReference: """ Allocates a unique table id, but does not create the table. diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 00ce0c722b..946faffab2 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -73,3 +73,31 @@ def test_read_gbq_colab_includes_formatted_scalars(session): } ), ) + + +def test_read_gbq_colab_includes_formatted_bigframes_dataframe( + session, scalars_df_index, scalars_pandas_df_index +): + pyformat_args = { + # Apply some operations to make sure the columns aren't renamed. + "some_dataframe": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( + int64_col=scalars_df_index["int64_too"] + ), + # This is not a supported type, but ignored if not referenced. + "some_object": object(), + } + df = session._read_gbq_colab( + """ + SELECT int64_col, rowindex + FROM {some_dataframe} + ORDER BY rowindex ASC + """, + pyformat_args=pyformat_args, + ) + result = df.to_pandas() + expected = ( + scalars_pandas_df_index[scalars_pandas_df_index["int64_col"] > 0] + .assign(int64_col=scalars_pandas_df_index["int64_too"]) + .reset_index(drop=False)[["int64_col", "rowindex"]] + ) + pandas.testing.assert_frame_equal(result, expected) diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index 9afdba9eb3..cffc6b3af7 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -14,6 +14,10 @@ """Unit tests for read_gbq_colab helper functions.""" +import textwrap + +from google.cloud import bigquery + from bigframes.testing import mocks @@ -32,29 +36,39 @@ def test_read_gbq_colab_includes_label(): assert "session-read_gbq_colab" in label_values -def test_read_gbq_colab_includes_formatted_values_in_dry_run(): +def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): session = mocks.create_bigquery_session() + bf_df = mocks.create_dataframe(monkeypatch, session=session) + bf_df._to_view = lambda: bigquery.TableReference.from_string("my-project.my_dataset.some_view") # type: ignore pyformat_args = { "some_integer": 123, "some_string": "This could be dangerous, but we escape it", + "bf_df": bf_df, # This is not a supported type, but ignored if not referenced. 
"some_object": object(), } + _ = session._read_gbq_colab( - """ - SELECT {some_integer} as some_integer, - {some_string} as some_string, - '{{escaped}}' as escaped - """, + textwrap.dedent( + """ + SELECT {some_integer} as some_integer, + {some_string} as some_string, + '{{escaped}}' as escaped + FROM {bf_df} + """ + ), pyformat_args=pyformat_args, dry_run=True, ) - expected = """ + expected = textwrap.dedent( + """ SELECT 123 as some_integer, 'This could be dangerous, but we escape it' as some_string, '{escaped}' as escaped + FROM `my-project`.`my_dataset`.`some_view` """ + ) queries = session._queries # type: ignore configs = session._job_configs # type: ignore From d937be04c4b2a0dfe6ef5b2d321cac61e8967fba Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 15 May 2025 17:32:03 -0700 Subject: [PATCH 13/52] test: fix snippets test of using partial ordering mode (#1741) * test: fix snippets test of using partial ordering mode * reset option --- samples/snippets/quickstart.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index adc85fa92d..8b3b08884a 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -71,3 +71,7 @@ def run_quickstart(project_id: str) -> None: model.fit(X, y) model.score(X, y) # [END bigquery_bigframes_quickstart] + + # close session and reset option so not to affect other tests + bpd.close_session() + bpd.options.bigquery.ordering_mode = "strict" From 545cdcac1361607678c2574f0f31eb43950073e5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 16 May 2025 10:47:55 -0700 Subject: [PATCH 14/52] fix: reduce bigquery table modification via DML for to_gbq (#1737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid exceeding BigQuery's 1500 daily table modification limit, to_gbq now prioritizes INSERT or MERGE DMLs. This method is used when the target table exists and shares the same schema, supporting both data replacement and appending. If schema discrepancies are found, to_gbq will default back to its original table modification process. 
Fixes internal issue 409086472 Co-authored-by: Tim Sweña (Swast) --- bigframes/core/compile/sqlglot/sqlglot_ir.py | 57 +++++++++ bigframes/session/bq_caching_executor.py | 65 ++++++++-- tests/system/small/test_dataframe_io.py | 122 ++++++++++++------- tests/unit/test_dataframe_io.py | 5 + 4 files changed, 198 insertions(+), 51 deletions(-) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index b23349bcbc..935ad393f8 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -17,6 +17,7 @@ import dataclasses import typing +from google.cloud import bigquery import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery @@ -104,6 +105,24 @@ def from_pyarrow( ) return cls(expr=sg.select(sge.Star()).from_(expr), uid_gen=uid_gen) + @classmethod + def from_query_string( + cls, + query_string: str, + ) -> SQLGlotIR: + """Builds SQLGlot expression from a query string""" + uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator() + cte_name = sge.to_identifier( + next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted + ) + cte = sge.CTE( + this=query_string, + alias=cte_name, + ) + select_expr = sge.Select().select(sge.Star()).from_(sge.Table(this=cte_name)) + select_expr.set("with", sge.With(expressions=[cte])) + return cls(expr=select_expr, uid_gen=uid_gen) + def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], @@ -133,6 +152,36 @@ def project( select_expr = self.expr.select(*projected_cols_expr, append=True) return SQLGlotIR(expr=select_expr) + def insert( + self, + destination: bigquery.TableReference, + ) -> str: + return sge.insert(self.expr.subquery(), _table(destination)).sql( + dialect=self.dialect, pretty=self.pretty + ) + + def replace( + self, + destination: bigquery.TableReference, + ) -> str: + # Workaround for SQLGlot breaking change: + # https://github.com/tobymao/sqlglot/pull/4495 + whens_expr = [ + sge.When(matched=False, source=True, then=sge.Delete()), + sge.When(matched=False, then=sge.Insert(this=sge.Var(this="ROW"))), + ] + whens_str = "\n".join( + when_expr.sql(dialect=self.dialect, pretty=self.pretty) + for when_expr in whens_expr + ) + + merge_str = sge.Merge( + this=_table(destination), + using=self.expr.subquery(), + on=_literal(False, dtypes.BOOL_DTYPE), + ).sql(dialect=self.dialect, pretty=self.pretty) + return f"{merge_str}\n{whens_str}" + def _encapsulate_as_cte( self, ) -> sge.Select: @@ -190,3 +239,11 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: def _cast(arg: typing.Any, to: str) -> sge.Cast: return sge.Cast(this=arg, to=to) + + +def _table(table: bigquery.TableReference) -> sge.Table: + return sge.Table( + this=sg.to_identifier(table.table_id, quoted=True), + db=sg.to_identifier(table.dataset_id, quoted=True), + catalog=sg.to_identifier(table.project, quoted=True), + ) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 6614abfed2..118838c059 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -29,9 +29,11 @@ import bigframes.core from bigframes.core import compile, rewrite +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.guid import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.schema as schemata import bigframes.core.tree_properties as tree_properties import bigframes.dtypes import 
bigframes.exceptions as bfe @@ -206,17 +208,45 @@ def export_gbq( if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) - dispositions = { - "fail": bigquery.WriteDisposition.WRITE_EMPTY, - "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, - "append": bigquery.WriteDisposition.WRITE_APPEND, - } + table_exists = True + try: + table = self.bqclient.get_table(destination) + if if_exists == "fail": + raise ValueError(f"Table already exists: {destination.__str__()}") + except google.api_core.exceptions.NotFound: + table_exists = False + + if len(cluster_cols) != 0: + if table_exists and table.clustering_fields != cluster_cols: + raise ValueError( + "Table clustering fields cannot be changed after the table has " + f"been created. Existing clustering fields: {table.clustering_fields}" + ) + sql = self.to_sql(array_value, ordered=False) - job_config = bigquery.QueryJobConfig( - write_disposition=dispositions[if_exists], - destination=destination, - clustering_fields=cluster_cols if cluster_cols else None, - ) + if table_exists and _if_schema_match(table.schema, array_value.schema): + # b/409086472: Uses DML for table appends and replacements to avoid + # BigQuery `RATE_LIMIT_EXCEEDED` errors, as per quota limits: + # https://cloud.google.com/bigquery/quotas#standard_tables + job_config = bigquery.QueryJobConfig() + ir = sqlglot_ir.SQLGlotIR.from_query_string(sql) + if if_exists == "append": + sql = ir.insert(destination) + else: # for "replace" + assert if_exists == "replace" + sql = ir.replace(destination) + else: + dispositions = { + "fail": bigquery.WriteDisposition.WRITE_EMPTY, + "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, + "append": bigquery.WriteDisposition.WRITE_APPEND, + } + job_config = bigquery.QueryJobConfig( + write_disposition=dispositions[if_exists], + destination=destination, + clustering_fields=cluster_cols if cluster_cols else None, + ) + # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. _, query_job = self._run_execute_query( @@ -572,6 +602,21 @@ def _execute_plan( ) +def _if_schema_match( + table_schema: Tuple[bigquery.SchemaField, ...], schema: schemata.ArraySchema +) -> bool: + if len(table_schema) != len(schema.items): + return False + for field in table_schema: + if field.name not in schema.names: + return False + if bigframes.dtypes.convert_schema_field(field)[1] != schema.get_type( + field.name + ): + return False + return True + + def _sanitize( schema: Tuple[bigquery.SchemaField, ...] 
) -> Tuple[bigquery.SchemaField, ...]: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d24b592b0d..857bec67c0 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -458,7 +458,7 @@ def test_to_csv_tabs( [True, False], ) @pytest.mark.skipif(pandas_gbq is None, reason="required by pd.read_gbq") -def test_to_gbq_index(scalars_dfs, dataset_id, index): +def test_to_gbq_w_index(scalars_dfs, dataset_id, index): """Test the `to_gbq` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs destination_table = f"{dataset_id}.test_index_df_to_gbq_{index}" @@ -485,48 +485,67 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index): pd.testing.assert_frame_equal(df_out, expected, check_index_type=False) -@pytest.mark.parametrize( - ("if_exists", "expected_index"), - [ - pytest.param("replace", 1), - pytest.param("append", 2), - pytest.param( - "fail", - 0, - marks=pytest.mark.xfail( - raises=google.api_core.exceptions.Conflict, - ), - ), - pytest.param( - "unknown", - 0, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], -) -@pytest.mark.skipif(pandas_gbq is None, reason="required by pd.read_gbq") -def test_to_gbq_if_exists( - scalars_df_default_index, - scalars_pandas_df_default_index, - dataset_id, - if_exists, - expected_index, -): - """Test the `to_gbq` API with the `if_exists` parameter.""" - destination_table = f"{dataset_id}.test_to_gbq_if_exists_{if_exists}" +def test_to_gbq_if_exists_is_fail(scalars_dfs, dataset_id): + scalars_df, scalars_pandas_df = scalars_dfs + destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_fails" + scalars_df.to_gbq(destination_table) - scalars_df_default_index.to_gbq(destination_table) - scalars_df_default_index.to_gbq(destination_table, if_exists=if_exists) + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) - gcs_df = pd.read_gbq(destination_table) - assert len(gcs_df.index) == expected_index * len( - scalars_pandas_df_default_index.index - ) - pd.testing.assert_index_equal( - gcs_df.columns, scalars_pandas_df_default_index.columns - ) + # Test default value is "fails" + with pytest.raises(ValueError, match="Table already exists"): + scalars_df.to_gbq(destination_table) + + with pytest.raises(ValueError, match="Table already exists"): + scalars_df.to_gbq(destination_table, if_exists="fail") + + +def test_to_gbq_if_exists_is_replace(scalars_dfs, dataset_id): + scalars_df, scalars_pandas_df = scalars_dfs + destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_replace" + scalars_df.to_gbq(destination_table) + + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + + # When replacing a table with same schema + scalars_df.to_gbq(destination_table, if_exists="replace") + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + + # When replacing a table with different schema + partitial_scalars_df = scalars_df.drop(columns=["string_col"]) + partitial_scalars_df.to_gbq(destination_table, if_exists="replace") + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(partitial_scalars_df) + 
pd.testing.assert_index_equal(gcs_df.columns, partitial_scalars_df.columns) + + +def test_to_gbq_if_exists_is_append(scalars_dfs, dataset_id): + scalars_df, scalars_pandas_df = scalars_dfs + destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_append" + scalars_df.to_gbq(destination_table) + + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + + # When appending to a table with same schema + scalars_df.to_gbq(destination_table, if_exists="append") + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == 2 * len(scalars_pandas_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) + + # When appending to a table with different schema + partitial_scalars_df = scalars_df.drop(columns=["string_col"]) + partitial_scalars_df.to_gbq(destination_table, if_exists="append") + gcs_df = pd.read_gbq(destination_table, index_col="rowindex") + assert len(gcs_df) == 3 * len(partitial_scalars_df) + pd.testing.assert_index_equal(gcs_df.columns, scalars_df.columns) def test_to_gbq_w_duplicate_column_names( @@ -773,6 +792,27 @@ def test_to_gbq_w_clustering_no_destination( assert table.expires is not None +def test_to_gbq_w_clustering_existing_table( + scalars_df_default_index, + dataset_id, + bigquery_client, +): + destination_table = f"{dataset_id}.test_to_gbq_w_clustering_existing_table" + scalars_df_default_index.to_gbq(destination_table) + + table = bigquery_client.get_table(destination_table) + assert table.clustering_fields is None + assert table.expires is None + + with pytest.raises(ValueError, match="Table clustering fields cannot be changed"): + clustering_columns = ["int64_col"] + scalars_df_default_index.to_gbq( + destination_table, + if_exists="replace", + clustering_columns=clustering_columns, + ) + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id") diff --git a/tests/unit/test_dataframe_io.py b/tests/unit/test_dataframe_io.py index 7845a71134..f2c0241396 100644 --- a/tests/unit/test_dataframe_io.py +++ b/tests/unit/test_dataframe_io.py @@ -49,3 +49,8 @@ def test_dataframe_to_pandas(mock_df, api_name, kwargs): mock_df.to_pandas.assert_called_once_with( allow_large_results=kwargs["allow_large_results"] ) + + +def test_to_gbq_if_exists_invalid(mock_df): + with pytest.raises(ValueError, match="Got invalid value 'invalid' for if_exists."): + mock_df.to_gbq("a.b.c", if_exists="invalid") From 2858b1efb4fe74097dcb17c086ee1dc18e53053c Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 16 May 2025 13:51:49 -0700 Subject: [PATCH 15/52] perf: Faster local data comparison using idenitity (#1738) --- bigframes/core/local_data.py | 4 ++-- tests/unit/test_local_data.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index d23f3538dd..2e8c4aff44 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -54,8 +54,8 @@ def from_arrow(cls, table: pa.Table) -> LocalTableMetadata: @dataclasses.dataclass(frozen=True) class ManagedArrowTable: - data: pa.Table = dataclasses.field(hash=False) - schema: schemata.ArraySchema = dataclasses.field(hash=False) + data: pa.Table = dataclasses.field(hash=False, compare=False) + schema: schemata.ArraySchema = dataclasses.field(hash=False, compare=False) id: uuid.UUID = 
dataclasses.field(default_factory=uuid.uuid4) @functools.cached_property diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py index bb7330aba4..71479e89d4 100644 --- a/tests/unit/test_local_data.py +++ b/tests/unit/test_local_data.py @@ -64,3 +64,16 @@ def test_local_data_well_formed_round_trip_sliced(): result.reset_index(drop=True), check_dtype=False, ) + + +def test_local_data_equal_self(): + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + assert local_entry == local_entry + assert hash(local_entry) == hash(local_entry) + + +def test_local_data_not_equal_other(): + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + local_entry2 = local_data.ManagedArrowTable.from_pandas(pd_data[::2]) + assert local_entry != local_entry2 + assert hash(local_entry) != hash(local_entry2) From 3ea6043be7025fa7a11cca27b02f5505bbc9b129 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 16 May 2025 14:13:11 -0700 Subject: [PATCH 16/52] feat: Add DataFrame.round method (#1742) * feat: Add DataFrame.round method * docstring full typename * restrict round test to pandas 2.x * add type hint --- bigframes/core/compile/scalar_op_compiler.py | 7 ++ bigframes/dataframe.py | 42 ++++++++++ bigframes/operations/numeric_ops.py | 2 +- tests/system/small/test_dataframe.py | 19 +++++ .../bigframes_vendored/pandas/core/frame.py | 77 +++++++++++++++++++ 5 files changed, 146 insertions(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a1cf72be97..6576276b11 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1826,6 +1826,13 @@ def fillna_op( @scalar_op_compiler.register_binary_op(ops.round_op) def round_op(x: ibis_types.Value, y: ibis_types.Value): + if x.type().is_integer(): + # bq produces float64, but pandas returns int + return ( + typing.cast(ibis_types.NumericValue, x) + .round(digits=typing.cast(ibis_types.IntegerValue, y)) + .cast(ibis_dtypes.int64) + ) return typing.cast(ibis_types.NumericValue, x).round( digits=typing.cast(ibis_types.IntegerValue, y) ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a98733b48a..cba635062f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -27,6 +27,7 @@ from typing import ( Callable, Dict, + Hashable, Iterable, List, Literal, @@ -3608,6 +3609,47 @@ def _groupby_series( def abs(self) -> DataFrame: return self._apply_unary_op(ops.abs_op) + def round(self, decimals: Union[int, dict[Hashable, int]] = 0) -> DataFrame: + is_mapping = utils.is_dict_like(decimals) + if not (is_mapping or isinstance(decimals, int)): + raise TypeError("'decimals' must be either a dict-like or integer.") + block = self._block + exprs = [] + for label, col_id, dtype in zip( + block.column_labels, block.value_columns, block.dtypes + ): + if dtype in set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) - { + bigframes.dtypes.BOOL_DTYPE + }: + if is_mapping: + if label in decimals: # type: ignore + exprs.append( + ops.round_op.as_expr( + col_id, + ex.const( + decimals[label], dtype=bigframes.dtypes.INT_DTYPE # type: ignore + ), + ) + ) + else: + exprs.append(ex.deref(col_id)) + else: + exprs.append( + ops.round_op.as_expr( + col_id, + ex.const( + typing.cast(int, decimals), + dtype=bigframes.dtypes.INT_DTYPE, + ), + ) + ) + else: + exprs.append(ex.deref(col_id)) + + return DataFrame( + block.project_exprs(exprs, labels=block.column_labels, drop=True) + ) + def isna(self) -> 
DataFrame: return self._apply_unary_op(ops.isnull_op) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 9d6749a169..b9820cd0ea 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -289,7 +289,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) round_op = base_ops.create_binary_op( - name="round", type_signature=op_typing.BINARY_REAL_NUMERIC + name="round", type_signature=op_typing.BINARY_NUMERIC ) unsafe_pow_op = base_ops.create_binary_op( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 00c11d073e..582d164540 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1635,6 +1635,25 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): ) +@pytest.mark.parametrize( + ("decimals",), + [ + (2,), + ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), + ({},), + ], +) +def test_dataframe_round(scalars_dfs, decimals): + if pd.__version__.startswith("1."): + pytest.skip("Rounding doesn't work as expected in pandas 1.x") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.round(decimals).to_pandas() + pd_result = scalars_pandas_df.round(decimals) + + assert_pandas_df_equal(bf_result, pd_result) + + def test_get_dtypes(scalars_df_default_index): dtypes = scalars_df_default_index.dtypes dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5bbf72b421..9bb25cb5a4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4788,6 +4788,83 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def round(self, decimals): + """ + Round a DataFrame to a variable number of decimal places. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats']) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + [4 rows x 2 columns] + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + [4 rows x 2 columns] + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + [4 rows x 2 columns] + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + [4 rows x 2 columns] + + Args: + decimals (int, dict, Series): + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. 
+ + Returns: + bigframes.pandas.DataFrame: + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def apply(self, func, *, axis=0, args=(), **kwargs): """Apply a function along an axis of the DataFrame. From 1df8ca6312ee428d55c2091a00c73b13d9a6b193 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 16 May 2025 14:39:59 -0700 Subject: [PATCH 17/52] docs: updated multimodal notebook instructions (#1745) --- .../multimodal/multimodal_dataframe.ipynb | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index b7d713c342..3f36c2908a 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -72,6 +72,22 @@ "### Setup" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the latest bigframes package if bigframes version < 2.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install bigframes --upgrade" + ] + }, { "cell_type": "code", "execution_count": null, @@ -84,10 +100,13 @@ }, "outputs": [], "source": [ - "PROJECT = \"bigframes-dev\" # replace with your project, project needs to be allowlisted go/bq-multimodal-allowlist (internal)\n", - "# User must have https://cloud.google.com/bigquery/docs/use-bigquery-dataframes#permissions to use bigframes, BQ connection admin/user to create/use connections, BQ ObjRef permissions for ObjectRef and BQ routines permissions for using transform functions.\n", - "# Or simply has BQ Admin role for all.\n", - "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket.\n", + "PROJECT = \"bigframes-dev\" # replace with your project. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", + "\n", + "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", + "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", + "# In this Notebook it uses bigframes-default-connection by default. 
You can also bring in your own connections in each method.\n", "\n", "import bigframes\n", "# Setup project\n", @@ -414,13 +433,6 @@ "chunked = df_pdf[\"chunked\"].explode()\n", "chunked" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 80aad9af794c2e06d1608c879f459a836fd4448b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 16 May 2025 15:16:59 -0700 Subject: [PATCH 18/52] feat: add deprecation warning to Gemini-1.5-X, text-embedding-004, and remove remove legacy models in notebooks and docs (#1723) * change all model_name from gemini-1.5-pro to gemini-2.0 due to model deprication * add warnign for deprcated models * add space for style * rewording note * test change * fix failed test * add comment back * use warning instead * remove replcated notes --- bigframes/ml/llm.py | 28 +++++++++++++++++++ bigframes/operations/semantics.py | 2 +- .../apps/synthetic_data_generation.ipynb | 2 +- .../bq_dataframes_llm_code_generation.ipynb | 2 +- .../bq_dataframes_llm_kmeans.ipynb | 2 +- .../bq_dataframes_llm_vector_search.ipynb | 2 +- ...q_dataframes_ml_drug_name_generation.ipynb | 2 +- .../generative_ai/large_language_models.ipynb | 16 +++++------ .../bq_dataframes_template.ipynb | 2 +- samples/snippets/gemini_model_test.py | 2 +- samples/snippets/multimodal_test.py | 2 +- tests/system/large/operations/conftest.py | 2 +- tests/system/large/operations/test_ai.py | 6 ++-- .../system/large/operations/test_semantics.py | 15 +++++----- tests/system/load/test_llm.py | 2 +- 15 files changed, 58 insertions(+), 29 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 591d18e3b5..11861c786e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -112,11 +112,18 @@ "If you proceed with '{model_name}', it might not work as expected or could lead to errors with multimodal inputs." ) +_MODEL_DEPRECATE_WARNING = ( + "'{model_name}' is going to be deprecated. Use '{new_model_name}' ({link}) instead." +) + @log_adapter.class_logger class TextEmbeddingGenerator(base.RetriableRemotePredictor): """Text embedding generator LLM model. + .. note:: + text-embedding-004 is going to be deprecated. Use text-embedding-005(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. + Args: model_name (str, Default to "text-embedding-004"): The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004" @@ -169,6 +176,15 @@ def _create_bqml_model(self): ) ) warnings.warn(msg) + if self.model_name == "text-embedding-004": + msg = exceptions.format_message( + _MODEL_DEPRECATE_WARNING.format( + model_name=self.model_name, + new_model_name="text-embedding-005", + link="https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator", + ) + ) + warnings.warn(msg) options = { "endpoint": self.model_name, @@ -416,6 +432,7 @@ class GeminiTextGenerator(base.RetriableRemotePredictor): default and a warning will be issued. .. note:: + "gemini-1.5-X" is going to be deprecated. Please use gemini-2.0-X instead. For example, "gemini-2.0-flash-001". "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. 
For more information, see the launch stage descriptions @@ -461,10 +478,12 @@ def __init__( "(https://cloud.google.com/products#product-launch-stages)." ) warnings.warn(msg, category=exceptions.PreviewWarning) + if model_name is None: model_name = "gemini-2.0-flash-001" msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) warnings.warn(msg, category=FutureWarning, stacklevel=2) + self.model_name = model_name self.session = session or global_session.get_global_session() self.max_iterations = max_iterations @@ -487,6 +506,15 @@ def _create_bqml_model(self): ) ) warnings.warn(msg) + if self.model_name.startswith("gemini-1.5"): + msg = exceptions.format_message( + _MODEL_DEPRECATE_WARNING.format( + model_name=self.model_name, + new_model_name="gemini-2.0-X", + link="https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator", + ) + ) + warnings.warn(msg) options = {"endpoint": self.model_name} diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 60d619992a..9fa5450748 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -252,7 +252,7 @@ def cluster_by( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator() + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") >>> df = bpd.DataFrame({ ... "Product": ["Smartphone", "Laptop", "T-shirt", "Jeans"], diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index f830e35c16..b59777a5da 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -111,7 +111,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 788111cfe6..edb864613c 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -430,7 +430,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 31a47ea424..9b05e1ab02 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -1614,7 +1614,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "q_a_model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "q_a_model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index a15209aae4..15929fd666 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -1816,7 +1816,7 @@ "source": [ "## gemini model\n", "\n", - "llm_model = bf_llm.GeminiTextGenerator(model_name = \"gemini-1.5-flash-002\") ## replace with other model as 
needed" + "llm_model = bf_llm.GeminiTextGenerator(model_name = \"gemini-2.0-flash-001\") ## replace with other model as needed" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index c65a0f0854..413e473c2f 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -581,7 +581,7 @@ ], "source": [ "# Define the model\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", "\n", "# Invoke LLM with prompt\n", "response = predict(zero_shot_prompt, temperature = TEMPERATURE)\n", diff --git a/notebooks/generative_ai/large_language_models.ipynb b/notebooks/generative_ai/large_language_models.ipynb index 4a0d2f2b3c..1d7bc7f6ef 100644 --- a/notebooks/generative_ai/large_language_models.ipynb +++ b/notebooks/generative_ai/large_language_models.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -21,23 +21,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_604997/3896046356.py:1: ApiDeprecationWarning: gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. \n", - " model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/llm.py:981: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + "/tmp/ipykernel_176683/987800245.py:1: ApiDeprecationWarning: gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. \n", + " model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/llm.py:486: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", " self.session = session or global_session.get_global_session()\n" ] }, { "data": { "text/html": [ - "Query job dd2da3cc-27c3-4c6f-9936-4f7769c85090 is DONE. 0 Bytes processed. Open Job" + "Query job 6fa5121a-6da4-4c75-92ec-936799da4513 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -49,7 +49,7 @@ { "data": { "text/html": [ - "Query job 00947011-4d7c-42fa-ae19-3b684976cec6 is DONE. 0 Bytes processed. Open Job" + "Query job 74460ae9-3e89-49e7-93ad-bafbb6197a86 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -60,7 +60,7 @@ } ], "source": [ - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 12847483ac..68c5e9f74d 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -1419,7 +1419,7 @@ "source": [ "# from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "# model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", + "# model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", "\n", "# pred = model.predict(df)\n", "# pred" diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py index cf809ebb3a..fe5d7d5b1e 100644 --- a/samples/snippets/gemini_model_test.py +++ b/samples/snippets/gemini_model_test.py @@ -30,7 +30,7 @@ def test_gemini_text_generator_model() -> None: session = bpd.get_global_session() connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" model = GeminiTextGenerator( - session=session, connection_name=connection, model_name="gemini-1.5-flash-002" + session=session, connection_name=connection, model_name="gemini-2.0-flash-001" ) df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 368f82d849..7f8e13cd7b 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -78,7 +78,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: # [START bigquery_dataframes_multimodal_dataframe_ml_text] from bigframes.ml import llm - gemini = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-002") + gemini = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # Deal with first 2 images as example df_image = df_image.head(2) diff --git a/tests/system/large/operations/conftest.py b/tests/system/large/operations/conftest.py index 4f6e2d1704..6f64c7552f 100644 --- a/tests/system/large/operations/conftest.py +++ b/tests/system/large/operations/conftest.py @@ -22,7 +22,7 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator: return llm.GeminiTextGenerator( session=session, connection_name=bq_connection, - model_name="gemini-1.5-flash-001", + model_name="gemini-2.0-flash-001", ) diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 04074a2ea6..c2797e39ee 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -434,7 +434,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) def test_self_join(session, gemini_flash_model): animals = dataframe.DataFrame( data={ - "animal": ["spider", "capybara"], + "animal": ["ant", "elephant"], }, session=session, ) @@ -453,8 +453,8 @@ def test_self_join(session, gemini_flash_model): expected_df = pd.DataFrame( { - "animal_left": ["capybara"], - "animal_right": ["spider"], + "animal_left": ["elephant"], + "animal_right": ["ant"], } ) pandas.testing.assert_frame_equal( diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 3517b1adbc..7ae78a5c53 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -86,7 +86,7 @@ def test_agg(session, 
gemini_flash_model, max_agg_rows, cluster_column): cluster_column=cluster_column, ).to_pandas() - expected_s = pd.Series(["Leonardo \n"], dtype=dtypes.STRING_DTYPE) + expected_s = pd.Series(["Leonardo\n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Movies" pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) @@ -137,12 +137,13 @@ def test_agg_w_int_column(session, gemini_flash_model): "Movies": [ "Killers of the Flower Moon", "The Great Gatsby", + "The Wolf of Wall Street", ], - "Years": [2023, 2013], + "Years": [2023, 2013, 2013], }, session=session, ) - instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." + instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Your answer should be the four-digit year, returned as a string." with bigframes.option_context( SEM_OP_EXP_OPTION, @@ -155,7 +156,7 @@ def test_agg_w_int_column(session, gemini_flash_model): model=gemini_flash_model, ).to_pandas() - expected_s = pd.Series(["2013 \n"], dtype=dtypes.STRING_DTYPE) + expected_s = pd.Series(["2013\n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Years" pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) @@ -764,7 +765,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) def test_self_join(session, gemini_flash_model): animals = dataframe.DataFrame( data={ - "animal": ["spider", "capybara"], + "animal": ["ant", "elephant"], }, session=session, ) @@ -783,8 +784,8 @@ def test_self_join(session, gemini_flash_model): expected_df = pd.DataFrame( { - "animal_left": ["capybara"], - "animal_right": ["spider"], + "animal_left": ["elephant"], + "animal_right": ["ant"], } ) pandas.testing.assert_frame_equal( diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 49f79d9d44..354aebcac5 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -81,7 +81,7 @@ def test_llm_gemini_configure_fit( @pytest.mark.flaky(retries=2) def test_llm_gemini_w_ground_with_google_search(llm_remote_text_df): - model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-002", max_iterations=1) + model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001", max_iterations=1) df = model.predict( llm_remote_text_df["prompt"], ground_with_google_search=True, From 133ac6b0e1f1e7a12844a4b6fd5b26df59f7ef37 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 19 May 2025 14:30:56 -0700 Subject: [PATCH 19/52] feat!: add structured output for ai map, ai filter and ai join (#1746) * add structured output for ai map, ai filter and ai join * fix mypy * fix test * update notebook --- bigframes/operations/ai.py | 140 +++++++++++----------- notebooks/experimental/ai_operators.ipynb | 114 ++++++++++++++++-- tests/system/large/operations/test_ai.py | 99 +++++++++++++-- tests/system/small/operations/test_ai.py | 30 ++++- 4 files changed, 291 insertions(+), 92 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 9d73fd43c1..c65947f53f 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + import re import typing -from typing import List, Optional +from typing import Dict, List, Optional import warnings import numpy as np @@ -34,7 +36,13 @@ def __init__(self, df) -> None: self._df: bigframes.dataframe.DataFrame = df - def filter(self, instruction: str, model, ground_with_google_search: bool = False): + def filter( + self, + instruction: str, + model, + ground_with_google_search: bool = False, + attach_logprobs: bool = False, + ): """ Filters the DataFrame with the semantics of the user instruction. @@ -74,6 +82,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + Returns: bigframes.pandas.DataFrame: DataFrame filtered by the instruction. @@ -82,72 +94,27 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ - import bigframes.dataframe - import bigframes.series - self._validate_model(model) - columns = self._parse_columns(instruction) - for column in columns: - if column not in self._df.columns: - raise ValueError(f"Column {column} not found.") + answer_col = "answer" - if ground_with_google_search: - msg = exceptions.format_message( - "Enables Grounding with Google Search may impact billing cost. See pricing " - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" - ) - warnings.warn(msg, category=UserWarning) - - self._confirm_operation(len(self._df)) - - df: bigframes.dataframe.DataFrame = self._df[columns].copy() - has_blob_column = False - for column in columns: - if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string - has_blob_column = True - continue - - if df[column].dtype != dtypes.STRING_DTYPE: - df[column] = df[column].astype(dtypes.STRING_DTYPE) - - user_instruction = self._format_instruction(instruction, columns) - output_instruction = "Based on the provided context, reply to the following claim by only True or False:" - - if has_blob_column: - results = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - df, - prompt=self._make_multimodel_prompt( - df, columns, user_instruction, output_instruction - ), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) - else: - results = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - self._make_text_prompt( - df, columns, user_instruction, output_instruction - ), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) + output_schema = {answer_col: "bool"} + result = self.map( + instruction, + model, + output_schema, + ground_with_google_search, + attach_logprobs, + ) - return self._df[ - results["ml_generate_text_llm_result"].str.lower().str.contains("true") - ] + return result[result[answer_col]].drop(answer_col, axis=1) def map( self, instruction: str, - output_column: str, model, + output_schema: Dict[str, str] | None = None, ground_with_google_search: bool = False, + attach_logprobs=False, ): """ Maps the DataFrame with the semantics of the user 
instruction. @@ -163,7 +130,7 @@ def map( >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) - >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"}) ingredient_1 ingredient_2 food 0 Burger Bun Beef Patty Burger @@ -180,12 +147,14 @@ def map( in the instructions like: "Get the ingredients of {food}." - output_column (str): - The column name of the mapping result. - model (bigframes.ml.llm.GeminiTextGenerator): A GeminiTextGenerator provided by Bigframes ML package. + output_schema (Dict[str, str] or None, default None): + The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of :. + Supported types are int64, float64, bool, string, array and struct. If None, generate string result under the column + "ml_generate_text_llm_result". + ground_with_google_search (bool, default False): Enables Grounding with Google Search for the GeminiTextGenerator model. When set to True, the model incorporates relevant information from Google @@ -194,6 +163,11 @@ def map( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + + Returns: bigframes.pandas.DataFrame: DataFrame with attached mapping results. @@ -236,6 +210,9 @@ def map( "Based on the provided contenxt, answer the following instruction:" ) + if output_schema is None: + output_schema = {"ml_generate_text_llm_result": "string"} + if has_blob_column: results = typing.cast( bigframes.series.Series, @@ -246,7 +223,8 @@ def map( ), temperature=0.0, ground_with_google_search=ground_with_google_search, - )["ml_generate_text_llm_result"], + output_schema=output_schema, + ), ) else: results = typing.cast( @@ -257,12 +235,28 @@ def map( ), temperature=0.0, ground_with_google_search=ground_with_google_search, - )["ml_generate_text_llm_result"], + output_schema=output_schema, + ), + ) + + attach_columns = [results[col] for col, _ in output_schema.items()] + + def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series: + from bigframes import bigquery as bbq + + logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0] + logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype( + "Float64" ) + logprobs.name = "logprob" + return logprobs + + if attach_logprobs: + attach_columns.append(extract_logprob(results["full_response"])) from bigframes.core.reshape.api import concat - return concat([self._df, results.rename(output_column)], axis=1) + return concat([self._df, *attach_columns], axis=1) def join( self, @@ -270,6 +264,7 @@ def join( instruction: str, model, ground_with_google_search: bool = False, + attach_logprobs=False, ): """ Joines two dataframes by applying the instruction over each pair of rows from @@ -313,10 +308,6 @@ def join( model (bigframes.ml.llm.GeminiTextGenerator): A GeminiTextGenerator provided by Bigframes ML package. 
- max_rows (int, default 1000): - The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method - call will end early with an error. - ground_with_google_search (bool, default False): Enables Grounding with Google Search for the GeminiTextGenerator model. When set to True, the model incorporates relevant information from Google @@ -325,6 +316,10 @@ def join( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + Returns: bigframes.pandas.DataFrame: The joined dataframe. @@ -400,7 +395,10 @@ def join( joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right")) return joined_df.ai.filter( - instruction, model, ground_with_google_search=ground_with_google_search + instruction, + model, + ground_with_google_search=ground_with_google_search, + attach_logprobs=attach_logprobs, ).reset_index(drop=True) def search( diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 9f35d3864a..49a9d798e2 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -139,7 +139,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", "the future.\n", " warnings.warn(msg, category=bfe.PreviewWarning)\n" ] @@ -160,13 +160,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "vCkraKOeqJFl" }, "outputs": [], "source": [ - "bpd.options.bigquery.project = 'YOUR_PROJECT_ID'\n", + "bpd.options.bigquery.project = 'bigframes-dev'\n", "bpd.options.bigquery.location = 'US'" ] }, @@ -227,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "F4dZm4b7iouR" }, @@ -248,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "BoUK-cpbiouS" }, @@ -403,7 +403,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -575,12 +575,108 @@ "id": "VFObP2aFiouS" }, "source": [ - "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results." 
+ "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the output column name." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are using BigFrames version `2.5.0` or later, the column name is specified with the `output_schema` parameter. This parameter expects a dictionary input in the form of `{'col_name': 'type_name'}`." ] }, { "cell_type": "code", "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ingredient_1ingredient_2food
0BunBeef PattyHamburger
1Soy BeanBitternTofu
2SausageLong BreadHotdog
\n", + "

3 rows × 3 columns

\n", + "
[3 rows x 3 columns in total]" + ], + "text/plain": [ + " ingredient_1 ingredient_2 food\n", + "0 Bun Beef Patty Hamburger\n", + "1 Soy Bean Bittern Tofu\n", + "2 Sausage Long Bread Hotdog\n", + "\n", + "[3 rows x 3 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", model=gemini_model, output_schema={\"food\": \"string\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are using BigFrames version 2.4.0 or prior, the column name is specified wit the `output_column` parameter. The outputs are always strings." + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -667,7 +763,7 @@ } ], "source": [ - "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" ] }, { @@ -3170,7 +3266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.17" } }, "nbformat": 4, diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index c2797e39ee..1b1d3a3376 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -66,6 +66,31 @@ def test_filter(session, gemini_flash_model): ) +def test_filter_attach_logprob(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "number_1": [1, 2], + "number_2": [2, 1], + "col": [0, 0], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{number_1} is greater than {number_2}", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + def test_filter_multi_model(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -186,7 +211,14 @@ def test_filter_invalid_model_raise_error(): df.ai.filter("{city} is the capital of {country}", None) -def test_map(session, gemini_flash_model): +@pytest.mark.parametrize( + ("output_schema", "output_col"), + [ + pytest.param(None, "ml_generate_text_llm_result", id="default_schema"), + pytest.param({"food": "string"}, "food", id="non_default_schema"), + ], +) +def test_map(session, gemini_flash_model, output_schema, output_col): df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -204,18 +236,18 @@ def test_map(session, gemini_flash_model): ): actual_df = df.ai.map( "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? 
One word only.", - "food", gemini_flash_model, + output_schema=output_schema, ).to_pandas() # Result sanitation - actual_df["food"] = actual_df["food"].str.strip().str.lower() + actual_df[output_col] = actual_df[output_col].str.strip().str.lower() expected_df = pd.DataFrame( { "ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"], "gluten-free": [True, True], - "food": ["burger", "tofu"], + output_col: ["burger", "tofu"], } ) pandas.testing.assert_frame_equal( @@ -227,6 +259,31 @@ def test_map(session, gemini_flash_model): ) +def test_map_attach_logprob(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + def test_map_multimodel(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -244,8 +301,8 @@ def test_map_multimodel(session, gemini_flash_model): ) result = df.ai.map( "What is the object in {image} combined with {scenario}? One word only.", - "object", gemini_flash_model, + output_schema={"object": "string"}, ).to_pandas() assert len(result) == len(df) @@ -279,7 +336,6 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): ): df.ai.map( "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - "food", gemini_flash_model, ) @@ -319,7 +375,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): THRESHOLD_OPTION, 10, ), pytest.raises(ValueError): - df.ai.map(instruction, "food", gemini_flash_model) + df.ai.map(instruction, gemini_flash_model, output_schema={"food": "string"}) def test_map_invalid_model_raise_error(): @@ -338,7 +394,6 @@ def test_map_invalid_model_raise_error(): ), pytest.raises(TypeError): df.ai.map( "What is the food made from {ingredient_1} and {ingredient_2}? 
One word only.", - "food", None, ) @@ -396,6 +451,34 @@ def test_join(instruction, session, gemini_flash_model): ) +def test_join_attach_logprob(session, gemini_flash_model): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.ai.join( + countries, + "{city} is in {country}", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + @pytest.mark.parametrize( ("reply"), [ diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index de6ba4b86c..25d411bef8 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -51,7 +51,11 @@ def test_filter(session): df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) model = FakeGeminiTextGenerator( dataframe.DataFrame( - {"ml_generate_text_llm_result": ["true", "false"]}, session=session + { + "answer": [True, False], + "full_response": _create_dummy_full_response(2), + }, + session=session, ), ) @@ -77,7 +81,11 @@ def test_map(session): df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) model = FakeGeminiTextGenerator( dataframe.DataFrame( - {"ml_generate_text_llm_result": ["true", "false"]}, session=session + { + "output": ["true", "false"], + "full_response": _create_dummy_full_response(2), + }, + session=session, ), ) @@ -87,7 +95,9 @@ def test_map(session): THRESHOLD_OPTION, 50, ): - result = df.ai.map("map {col}", model=model, output_column="output").to_pandas() + result = df.ai.map( + "map {col}", model=model, output_schema={"output": "string"} + ).to_pandas() pandas.testing.assert_frame_equal( result, @@ -102,7 +112,13 @@ def test_join(session): left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) model = FakeGeminiTextGenerator( - dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session), + dataframe.DataFrame( + { + "answer": [True], + "full_response": _create_dummy_full_response(1), + }, + session=session, + ), ) with bigframes.option_context( @@ -139,3 +155,9 @@ def test_top_k(session): result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() assert len(result) == 1 + + +def _create_dummy_full_response(row_count: int) -> pd.Series: + entry = """{"candidates": [{"avg_logprobs": -0.5}]}""" + + return pd.Series([entry] * row_count) From d451aefd2181aef250c3b48cceac09063081cab2 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 20 May 2025 10:17:15 -0700 Subject: [PATCH 20/52] fix: Fix clip int series with float bounds (#1739) --- bigframes/core/compile/scalar_op_compiler.py | 22 +++----------------- bigframes/operations/base.py | 6 +++--- bigframes/series.py | 7 +++++-- tests/system/small/test_series.py | 11 ++++++++++ 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 6576276b11..228c866e1a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1935,34 +1935,18 @@ def clip_op( if isinstance(lower, ibis_types.NullScalar) and ( not isinstance(upper, ibis_types.NullScalar) ): - return ( - ibis_api.case() # type: ignore - .when(upper.isnull() | 
(original > upper), upper) - .else_(original) - .end() - ) + return ibis_api.least(original, upper) elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( upper, ibis_types.NullScalar ): - return ( - ibis_api.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .else_(original) - .end() - ) + return ibis_api.greatest(original, lower) elif isinstance(lower, ibis_types.NullScalar) and ( isinstance(upper, ibis_types.NullScalar) ): return original else: # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound - return ( - ibis_api.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) + return ibis_api.greatest(ibis_api.least(original, upper), lower) # N-ary Operations diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 8d70596b7d..c316d28321 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -245,9 +245,9 @@ def _align( ) return (typing.cast(ex.DerefOp, values[0]), values[1], block) - def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left") -> tuple[ex.DerefOp, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore + def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left", cast_scalars: bool = True) -> tuple[ex.DerefOp, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore """Aligns the series value with 2 other scalars or series objects. Returns new values and joined tabled expression.""" - values, index = self._align_n([other1, other2], how) + values, index = self._align_n([other1, other2], how, cast_scalars=cast_scalars) return ( typing.cast(ex.DerefOp, values[0]), values[1], @@ -260,7 +260,7 @@ def _align_n( others: typing.Sequence[typing.Union[series.Series, scalars.Scalar]], how="outer", ignore_self=False, - cast_scalars: bool = True, + cast_scalars: bool = False, ) -> tuple[ typing.Sequence[Union[ex.ScalarConstantExpression, ex.DerefOp]], blocks.Block, diff --git a/bigframes/series.py b/bigframes/series.py index 2c387734d3..626cf2fc76 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1353,14 +1353,17 @@ def where(self, cond, other=None): ) return Series(block.select_column(result_id).with_column_labels([self.name])) - def clip(self, lower, upper): + def clip(self, lower=None, upper=None): if lower is None and upper is None: return self if lower is None: return self._apply_binary_op(upper, ops.minimum_op, alignment="left") if upper is None: return self._apply_binary_op(lower, ops.maximum_op, alignment="left") - value_id, lower_id, upper_id, block = self._align3(lower, upper) + # special rule to coerce scalar string args to date + value_id, lower_id, upper_id, block = self._align3( + lower, upper, cast_scalars=(bigframes.dtypes.is_date_like(self.dtype)) + ) block, result_id = block.project_expr( ops.clip_op.as_expr(value_id, lower_id, upper_id), ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 7972fbe1e9..286a480d18 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3001,6 +3001,17 @@ def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): assert_series_equal(bf_result, pd_result, ignore_order=not ordered) +def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): + col_bf = 
scalars_df_index["int64_too"] + bf_result = col_bf.clip(-100, 3.14151593).to_pandas() + + col_pd = scalars_pandas_df_index["int64_too"] + # pandas doesn't work with Int64 and clip with floats + pd_result = col_pd.astype("int64").clip(-100, 3.14151593).astype("Float64") + + assert_series_equal(bf_result, pd_result) + + def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 From 36c359d2521089e186a412d353daf9de6cfbc8f4 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 20 May 2025 12:53:11 -0700 Subject: [PATCH 21/52] feat: add bpd.options.reset() method (#1743) * feat: add bpd.options.reset() method * reset experimental flag --- bigframes/_config/__init__.py | 9 +++++++++ samples/snippets/quickstart.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 775ef70bc7..52b47e3e9a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -56,12 +56,21 @@ class Options: """Global options affecting BigQuery DataFrames behavior.""" def __init__(self): + self.reset() + + def reset(self) -> Options: + """Reset the option settings to defaults. + + Returns: + bigframes._config.Options: Options object with default values. + """ self._local = ThreadLocalConfig() # BigQuery options are special because they can only be set once per # session, so we need an indicator as to whether we are using the # thread-local session or the global session. self._bigquery_options = bigquery_options.BigQueryOptions() + return self def _init_bigquery_thread_local(self): """Initialize thread-local options, based on current global options.""" diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index 8b3b08884a..bc05cd2512 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -74,4 +74,4 @@ def run_quickstart(project_id: str) -> None: # close session and reset option so not to affect other tests bpd.close_session() - bpd.options.bigquery.ordering_mode = "strict" + bpd.options.reset() From 27fac78cb5654e5655aec861062837a7d4f3f679 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 21 May 2025 11:23:49 -0700 Subject: [PATCH 22/52] feat: support `unique()` for Index (#1750) * feat: support for Index * fix lint * fix test by limiting Pandas version --- bigframes/core/indexes/base.py | 6 ++++ tests/system/small/core/indexes/test_base.py | 35 +++++++++++++++++++ .../pandas/core/indexes/base.py | 23 ++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 tests/system/small/core/indexes/test_base.py diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 9b4b5e4290..6da68e2e8f 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -451,6 +451,12 @@ def drop_duplicates(self, *, keep: str = "first") -> Index: block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) return Index(block) + def unique(self, level: Hashable | int | None = None) -> Index: + if level is None: + return self.drop_duplicates() + + return self.get_level_values(level).drop_duplicates() + def isin(self, values) -> Index: if not utils.is_list_like(values): raise TypeError( diff --git a/tests/system/small/core/indexes/test_base.py b/tests/system/small/core/indexes/test_base.py new file mode 100644 index 0000000000..05ea40cfb9 --- /dev/null +++ 
b/tests/system/small/core/indexes/test_base.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from packaging import version +import pandas as pd +import pandas.testing +import pytest + + +@pytest.mark.parametrize("level", [None, 0, 1, "level0", "level1"]) +def test_unique(session, level): + if version.Version(pd.__version__) < version.Version("2.0.0"): + pytest.skip("StringDtype for multi-index not supported until Pandas 2.0") + arrays = [ + pd.Series(["A", "A", "B", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")), + pd.Series([1, 2, 1, 2, 1], dtype=pd.Int64Dtype()), + ] + pd_idx = pd.MultiIndex.from_arrays(arrays, names=["level0", "level1"]) + bf_idx = session.read_pandas(pd_idx) + + actual_result = bf_idx.unique(level).to_pandas() + + expected_result = pd_idx.unique(level) + pandas.testing.assert_index_equal(actual_result, expected_result) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index c94f707671..be1c5034f9 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,6 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py from __future__ import annotations +from collections.abc import Hashable import typing from bigframes import constants @@ -1061,6 +1062,28 @@ def drop_duplicates(self, *, keep: str = "first"): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unique(self, level: Hashable | int | None = None): + """ + Returns unique values in the index. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 1, 2, 3, 3]) + >>> idx.unique() + Index([1, 2, 3], dtype='Int64') + + Args: + level (int or hashable, optional): + Only return values from specified level (for MultiIndex). + If int, gets the level by integer position, else by level name. + + Returns: + bigframes.pandas.Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. From 1cf9f5e8dba733ee26d15fc5edc44c81e094e9a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 21 May 2025 14:24:28 -0500 Subject: [PATCH 23/52] fix: prevent creating unnecessary client objects in multithreaded environments (#1757) This prevents extra authentication and default location queries in multithreaded environments. 
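As an illustration only (not part of this patch), here is a minimal sketch of the lock-guarded lazy-initialization pattern that this change applies to each client property. The class and constructor names below are hypothetical stand-ins, not the exact BigFrames API:

import threading


class LazyClients:
    """Hypothetical example: create each expensive client at most once across threads."""

    def __init__(self):
        # One lock per lazily created client, mirroring the per-client locks in this patch.
        self._bqclient_lock = threading.Lock()
        self._bqclient = None

    def _create_bigquery_client(self):
        # Stand-in for the real constructor; assume it performs auth and network calls.
        return object()

    @property
    def bqclient(self):
        # Only one thread constructs the client; later callers reuse the cached instance.
        with self._bqclient_lock:
            if self._bqclient is None:
                self._bqclient = self._create_bigquery_client()
        return self._bqclient

Without the lock, two threads could both observe that the client is None and build duplicate clients, which is the redundant-authentication problem described above.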
--- bigframes/pandas/io/api.py | 62 ++++++------ bigframes/session/clients.py | 177 ++++++++++++++++++++--------------- 2 files changed, 136 insertions(+), 103 deletions(-) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 16548dd4ad..c09251de3b 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -15,6 +15,7 @@ from __future__ import annotations import inspect +import threading import typing from typing import ( Any, @@ -465,6 +466,8 @@ def from_glob_path( from_glob_path.__doc__ = inspect.getdoc(bigframes.session.Session.from_glob_path) +_default_location_lock = threading.Lock() + def _set_default_session_location_if_possible(query): # Set the location as per the query if this is the first query the user is @@ -475,31 +478,34 @@ def _set_default_session_location_if_possible(query): # If query is a table name, then it would be the location of the table. # If query is a SQL with a table, then it would be table's location. # If query is a SQL with no table, then it would be the BQ default location. - if ( - config.options.bigquery._session_started - or config.options.bigquery.location - or config.options.bigquery.use_regional_endpoints - ): - return - - clients_provider = bigframes.session.clients.ClientsProvider( - project=config.options.bigquery.project, - location=config.options.bigquery.location, - use_regional_endpoints=config.options.bigquery.use_regional_endpoints, - credentials=config.options.bigquery.credentials, - application_name=config.options.bigquery.application_name, - bq_kms_key_name=config.options.bigquery.kms_key_name, - client_endpoints_override=config.options.bigquery.client_endpoints_override, - ) - - bqclient = clients_provider.bqclient - - if bigframes.session._io.bigquery.is_query(query): - # Intentionally run outside of the session so that we can detect the - # location before creating the session. Since it's a dry_run, labels - # aren't necessary. - job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) - config.options.bigquery.location = job.location - else: - table = bqclient.get_table(query) - config.options.bigquery.location = table.location + global _default_location_lock + + with _default_location_lock: + if ( + config.options.bigquery._session_started + or config.options.bigquery.location + or config.options.bigquery.use_regional_endpoints + ): + return + + clients_provider = bigframes.session.clients.ClientsProvider( + project=config.options.bigquery.project, + location=config.options.bigquery.location, + use_regional_endpoints=config.options.bigquery.use_regional_endpoints, + credentials=config.options.bigquery.credentials, + application_name=config.options.bigquery.application_name, + bq_kms_key_name=config.options.bigquery.kms_key_name, + client_endpoints_override=config.options.bigquery.client_endpoints_override, + ) + + bqclient = clients_provider.bqclient + + if bigframes.session._io.bigquery.is_query(query): + # Intentionally run outside of the session so that we can detect the + # location before creating the session. Since it's a dry_run, labels + # aren't necessary. 
+ job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + config.options.bigquery.location = job.location + else: + table = bqclient.get_table(query) + config.options.bigquery.location = table.location diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 5ef974d565..a8e1ab71f1 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -15,12 +15,12 @@ """Clients manages the connection to Google APIs.""" import os +import threading import typing from typing import Optional import google.api_core.client_info import google.api_core.client_options -import google.api_core.exceptions import google.api_core.gapic_v1.client_info import google.auth.credentials import google.cloud.bigquery as bigquery @@ -84,6 +84,9 @@ def __init__( if credentials is None: credentials, credentials_project = _get_default_credentials_with_project() + # Ensure an access token is available. + credentials.refresh(google.auth.transport.requests.Request()) + # Prefer the project in this order: # 1. Project explicitly specified by the user # 2. Project set in the environment @@ -127,19 +130,30 @@ def __init__( self._client_endpoints_override = client_endpoints_override # cloud clients initialized for lazy load + self._bqclient_lock = threading.Lock() self._bqclient = None + + self._bqconnectionclient_lock = threading.Lock() self._bqconnectionclient: Optional[ google.cloud.bigquery_connection_v1.ConnectionServiceClient ] = None + + self._bqstoragereadclient_lock = threading.Lock() self._bqstoragereadclient: Optional[ google.cloud.bigquery_storage_v1.BigQueryReadClient ] = None + + self._bqstoragewriteclient_lock = threading.Lock() self._bqstoragewriteclient: Optional[ google.cloud.bigquery_storage_v1.BigQueryWriteClient ] = None + + self._cloudfunctionsclient_lock = threading.Lock() self._cloudfunctionsclient: Optional[ google.cloud.functions_v2.FunctionServiceClient ] = None + + self._resourcemanagerclient_lock = threading.Lock() self._resourcemanagerclient: Optional[ google.cloud.resourcemanager_v3.ProjectsClient ] = None @@ -166,6 +180,7 @@ def _create_bigquery_client(self): project=self._project, location=self._location, ) + if self._bq_kms_key_name: # Note: Key configuration only applies automatically to load and query jobs, not copy jobs. 
encryption_config = bigquery.EncryptionConfiguration( @@ -186,114 +201,126 @@ def _create_bigquery_client(self): @property def bqclient(self): - if not self._bqclient: - self._bqclient = self._create_bigquery_client() + with self._bqclient_lock: + if not self._bqclient: + self._bqclient = self._create_bigquery_client() return self._bqclient @property def bqconnectionclient(self): - if not self._bqconnectionclient: - bqconnection_options = None - if "bqconnectionclient" in self._client_endpoints_override: - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqconnectionclient"] - ) + with self._bqconnectionclient_lock: + if not self._bqconnectionclient: + bqconnection_options = None + if "bqconnectionclient" in self._client_endpoints_override: + bqconnection_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqconnectionclient" + ] + ) - bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqconnectionclient = ( - google.cloud.bigquery_connection_v1.ConnectionServiceClient( - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=self._credentials, + bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqconnectionclient = ( + google.cloud.bigquery_connection_v1.ConnectionServiceClient( + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, + ) ) - ) return self._bqconnectionclient @property def bqstoragereadclient(self): - if not self._bqstoragereadclient: - bqstorage_options = None - if "bqstoragereadclient" in self._client_endpoints_override: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqstoragereadclient"] - ) - elif self._use_regional_endpoints: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( - location=self._location + with self._bqstoragereadclient_lock: + if not self._bqstoragereadclient: + bqstorage_options = None + if "bqstoragereadclient" in self._client_endpoints_override: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqstoragereadclient" + ] + ) + elif self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) - ) - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqstoragereadclient = ( - google.cloud.bigquery_storage_v1.BigQueryReadClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=self._credentials, + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqstoragereadclient = ( + google.cloud.bigquery_storage_v1.BigQueryReadClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) ) - ) return self._bqstoragereadclient @property def bqstoragewriteclient(self): - if not self._bqstoragewriteclient: - bqstorage_options = None - if "bqstoragewriteclient" in self._client_endpoints_override: - bqstorage_options = google.api_core.client_options.ClientOptions( - 
api_endpoint=self._client_endpoints_override["bqstoragewriteclient"] - ) - elif self._use_regional_endpoints: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( - location=self._location + with self._bqstoragewriteclient_lock: + if not self._bqstoragewriteclient: + bqstorage_options = None + if "bqstoragewriteclient" in self._client_endpoints_override: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqstoragewriteclient" + ] + ) + elif self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) - ) - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqstoragewriteclient = ( - google.cloud.bigquery_storage_v1.BigQueryWriteClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=self._credentials, + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqstoragewriteclient = ( + google.cloud.bigquery_storage_v1.BigQueryWriteClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) ) - ) return self._bqstoragewriteclient @property def cloudfunctionsclient(self): - if not self._cloudfunctionsclient: - functions_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._cloudfunctionsclient = ( - google.cloud.functions_v2.FunctionServiceClient( - client_info=functions_info, - credentials=self._credentials, + with self._cloudfunctionsclient_lock: + if not self._cloudfunctionsclient: + functions_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._cloudfunctionsclient = ( + google.cloud.functions_v2.FunctionServiceClient( + client_info=functions_info, + credentials=self._credentials, + ) ) - ) return self._cloudfunctionsclient @property def resourcemanagerclient(self): - if not self._resourcemanagerclient: - resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._resourcemanagerclient = ( - google.cloud.resourcemanager_v3.ProjectsClient( - credentials=self._credentials, client_info=resourcemanager_info + with self._resourcemanagerclient_lock: + if not self._resourcemanagerclient: + resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._resourcemanagerclient = ( + google.cloud.resourcemanager_v3.ProjectsClient( + credentials=self._credentials, client_info=resourcemanager_info + ) ) - ) return self._resourcemanagerclient From ea713bf48cfd37e2c33fe473584e9ff2fd4a2c52 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 21 May 2025 14:57:12 -0700 Subject: [PATCH 24/52] chore: Move polars_session.py to bigframes/testing module (#1760) --- {tests/unit => bigframes/testing}/polars_session.py | 0 tests/unit/test_local_engine.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename {tests/unit => bigframes/testing}/polars_session.py (100%) diff --git a/tests/unit/polars_session.py b/bigframes/testing/polars_session.py similarity index 100% rename from tests/unit/polars_session.py rename to bigframes/testing/polars_session.py diff --git a/tests/unit/test_local_engine.py 
b/tests/unit/test_local_engine.py index b4672d07a9..e36dc3df3c 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -27,7 +27,7 @@ # All tests in this file require polars to be installed to pass. @pytest.fixture(scope="module") def polars_session(): - from . import polars_session + from bigframes.testing import polars_session return polars_session.TestSession() From 68d5e2cbef3510cadc7e9dd199117c1e3b02d19f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 21 May 2025 15:44:14 -0700 Subject: [PATCH 25/52] deps: avoid `gcsfs==2025.5.0` (#1762) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: pin `gcsfs<2025.5.0` The latest gcsfs release broke BigFrames, so pinning its version to the last known good version. * Update setup.py --------- Co-authored-by: Tim Sweña (Swast) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index edd8e63e65..ff40d29a16 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ # please keep these in sync with the minimum versions in testing/constraints-3.9.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0", + "gcsfs >=2023.3.0, !=2025.5.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", From 768a7570845c4eb88f495d7f3c0f3158accdc231 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 21 May 2025 15:54:45 -0700 Subject: [PATCH 26/52] feat: add support for df.loc[list, column(s)] (#1761) --- bigframes/core/indexers.py | 4 ++-- tests/system/small/test_dataframe.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 6258eb00d5..c60e40880b 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -155,8 +155,8 @@ def __getitem__(self, key): # row key. 
We must choose one, so bias towards treating as multi-part row label if isinstance(key, tuple) and len(key) == 2: is_row_multi_index = self._dataframe.index.nlevels > 1 - is_first_item_tuple = isinstance(key[0], tuple) - if not is_row_multi_index or is_first_item_tuple: + is_first_item_list_or_tuple = isinstance(key[0], (tuple, list)) + if not is_row_multi_index or is_first_item_list_or_tuple: df = typing.cast( bigframes.dataframe.DataFrame, _loc_getitem_series_or_dataframe(self._dataframe, key[0]), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 582d164540..596b9b17f1 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3638,9 +3638,7 @@ def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): scalars_df_index.iat[index] -def test_iloc_single_integer_out_of_bound_error( - scalars_df_index, scalars_pandas_df_index -): +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): scalars_df_index.iloc[99] @@ -3655,6 +3653,17 @@ def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): ) +def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): + idx_list = [0, 3, 5] + bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() pd_result = scalars_pandas_df_index.loc[:, "int64_col"] From c51d2b1f66379a3aaa915d6bd91837f02d8ada40 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 21 May 2025 15:59:13 -0700 Subject: [PATCH 27/52] test: remove gemini-1x model tests due to model deprecation (#1764) --- tests/system/load/test_llm.py | 2 -- tests/system/small/ml/test_llm.py | 18 ------------------ tests/system/small/ml/test_multimodal_llm.py | 8 -------- 3 files changed, 28 deletions(-) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 354aebcac5..5cf9621ef9 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -41,8 +41,6 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 3f06a02469..8bfdffc140 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -108,12 +108,6 @@ def test_create_load_multimodal_embedding_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -143,12 +137,6 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -199,12 +187,6 @@ def 
test_gemini_text_generator_predict_with_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index e066d00cf5..ba834906b2 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -41,10 +41,6 @@ def test_multimodal_embedding_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -71,10 +67,6 @@ def test_gemini_text_generator_multimodal_input( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", ), From 190390b804c2131c2eaa624d7f025febb7784b01 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 21 May 2025 16:29:12 -0700 Subject: [PATCH 28/52] fix: support JSON and STRUCT for bbq.sql_scalar (#1754) Fixes internal issue 416015997 --- bigframes/bigquery/_operations/sql.py | 13 +- bigframes/core/compile/sqlglot/sqlglot_ir.py | 7 ++ bigframes/dtypes.py | 27 ---- tests/system/small/bigquery/test_sql.py | 117 +++++++++++++++++- .../test_compile_readlocal/out.sql | 12 +- .../ibis/backends/bigquery/datatypes.py | 2 + 6 files changed, 136 insertions(+), 42 deletions(-) diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 7ccf63fcda..a84c074e01 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -20,6 +20,7 @@ import google.cloud.bigquery +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.sql import bigframes.dataframe import bigframes.dtypes @@ -72,16 +73,16 @@ def sql_scalar( # Another benefit of this is that if there is a syntax error in the SQL # template, then this will fail with an error earlier in the process, # aiding users in debugging. - base_series = columns[0] - literals = [ - bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns + literals_sql = [ + sqlglot_ir._literal(None, column.dtype).sql(dialect="bigquery") + for column in columns ] - literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals] + select_sql = sql_template.format(*literals_sql) + dry_run_sql = f"SELECT {select_sql}" # Use the executor directly, because we want the original column IDs, not # the user-friendly column names that block.to_sql_query() would produce. 
- select_sql = sql_template.format(*literals_sql) - dry_run_sql = f"SELECT {select_sql}" + base_series = columns[0] bqclient = base_series._session.bqclient job = bqclient.query( dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 935ad393f8..bd1d225d65 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -18,6 +18,7 @@ import typing from google.cloud import bigquery +import numpy as np import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery @@ -213,7 +214,11 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: elif dtype == dtypes.BYTES_DTYPE: return _cast(str(value), sqlglot_type) elif dtypes.is_time_like(dtype): + if isinstance(value, np.generic): + value = value.item() return _cast(sge.convert(value.isoformat()), sqlglot_type) + elif dtype in (dtypes.NUMERIC_DTYPE, dtypes.BIGNUMERIC_DTYPE): + return _cast(sge.convert(value), sqlglot_type) elif dtypes.is_geo_like(dtype): wkt = value if isinstance(value, str) else to_wkt(value) return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt)) @@ -234,6 +239,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: ) return values if len(value) > 0 else _cast(values, sqlglot_type) else: + if isinstance(value, np.generic): + value = value.item() return sge.convert(value) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index ec115a93d0..262fa9dde7 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -499,33 +499,6 @@ def bigframes_dtype_to_arrow_dtype( ) -def bigframes_dtype_to_literal( - bigframes_dtype: Dtype, -) -> Any: - """Create a representative literal value for a bigframes dtype. - - The inverse of infer_literal_type(). - """ - if isinstance(bigframes_dtype, pd.ArrowDtype): - arrow_type = bigframes_dtype.pyarrow_dtype - return arrow_type_to_literal(arrow_type) - - if isinstance(bigframes_dtype, pd.Float64Dtype): - return 1.0 - if isinstance(bigframes_dtype, pd.Int64Dtype): - return 1 - if isinstance(bigframes_dtype, pd.BooleanDtype): - return True - if isinstance(bigframes_dtype, pd.StringDtype): - return "string" - if isinstance(bigframes_dtype, gpd.array.GeometryDtype): - return shapely.geometry.Point((0, 0)) - - raise TypeError( - f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" - ) - - def arrow_type_to_literal( arrow_type: pa.DataType, ) -> Any: diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py index 283624100a..c519b427fa 100644 --- a/tests/system/small/bigquery/test_sql.py +++ b/tests/system/small/bigquery/test_sql.py @@ -12,11 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import bigframes.bigquery +import pandas as pd +import pytest +import bigframes.bigquery as bbq +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd -def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): - series = bigframes.bigquery.sql_scalar( + +def test_sql_scalar_for_all_scalar_types(scalars_df_null_index): + series = bbq.sql_scalar( """ CAST({0} AS INT64) + BYTE_LENGTH({1}) @@ -48,3 +53,109 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): ) result = series.to_pandas() assert len(result) == len(scalars_df_null_index) + + +def test_sql_scalar_for_bool_series(scalars_df_index): + series: bpd.Series = scalars_df_index["bool_col"] + result = bbq.sql_scalar("CAST({0} AS INT64)", [series]) + expected = series.astype(dtypes.INT_DTYPE) + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +@pytest.mark.parametrize( + ("column_name"), + [ + pytest.param("bool_col"), + pytest.param("bytes_col"), + pytest.param("date_col"), + pytest.param("datetime_col"), + pytest.param("geography_col"), + pytest.param("int64_col"), + pytest.param("numeric_col"), + pytest.param("float64_col"), + pytest.param("string_col"), + pytest.param("time_col"), + pytest.param("timestamp_col"), + ], +) +def test_sql_scalar_outputs_all_scalar_types(scalars_df_index, column_name): + series: bpd.Series = scalars_df_index[column_name] + result = bbq.sql_scalar("{0}", [series]) + expected = series + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_array_series(repeated_df): + result = bbq.sql_scalar( + """ + ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2}) + + ARRAY_LENGTH({3}) + ARRAY_LENGTH({4}) + ARRAY_LENGTH({5}) + + ARRAY_LENGTH({6}) + """, + [ + repeated_df["int_list_col"], + repeated_df["bool_list_col"], + repeated_df["float_list_col"], + repeated_df["date_list_col"], + repeated_df["date_time_list_col"], + repeated_df["numeric_list_col"], + repeated_df["string_list_col"], + ], + ) + + expected = ( + repeated_df["int_list_col"].list.len() + + repeated_df["bool_list_col"].list.len() + + repeated_df["float_list_col"].list.len() + + repeated_df["date_list_col"].list.len() + + repeated_df["date_time_list_col"].list.len() + + repeated_df["numeric_list_col"].list.len() + + repeated_df["string_list_col"].list.len() + ) + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_array_series(repeated_df): + result = bbq.sql_scalar("{0}", [repeated_df["int_list_col"]]) + expected = repeated_df["int_list_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_struct_series(nested_structs_df): + result = bbq.sql_scalar( + "CHAR_LENGTH({0}.name) + {0}.age", + [nested_structs_df["person"]], + ) + expected = nested_structs_df["person"].struct.field( + "name" + ).str.len() + nested_structs_df["person"].struct.field("age") + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_struct_series(nested_structs_df): + result = bbq.sql_scalar("{0}", [nested_structs_df["person"]]) + expected = nested_structs_df["person"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_json_series(json_df): + result = bbq.sql_scalar( + """JSON_VALUE({0}, '$.int_value')""", + [ + json_df["json_col"], + ], + ) + expected = 
bbq.json_value(json_df["json_col"], "$.int_value") + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_json_series(json_df): + result = bbq.sql_scalar("{0}", [json_df["json_col"]]) + expected = json_df["json_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index f04f9ed023..f73ef34051 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -10,7 +10,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), 123456789, 0, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 1.25, 0, 0, @@ -27,7 +27,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), -987654321, 1, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 2.51, 1, 1, @@ -44,7 +44,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), 314159, 0, - 101.101010100, + CAST(101.101010100 AS NUMERIC), 25000000000.0, 2, 2, @@ -95,7 +95,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), 55555, 0, - 5.555555000, + CAST(5.555555000 AS NUMERIC), 555.555, 5, 5, @@ -112,7 +112,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), 101202303, 2, - -10.090807000, + CAST(-10.090807000 AS NUMERIC), -123.456, 6, 6, @@ -129,7 +129,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), -214748367, 2, - 11111111.100000000, + CAST(11111111.100000000 AS NUMERIC), 42.42, 7, 7, diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py index 5b4e4d85a1..fba0339ae9 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py @@ -53,6 +53,8 @@ def from_ibis(cls, dtype: dt.DataType) -> str: ) elif dtype.is_integer(): return "INT64" + elif dtype.is_boolean(): + return "BOOLEAN" elif dtype.is_binary(): return "BYTES" elif dtype.is_string(): From bb511475b74cc253230725846098a9045be2e324 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 21 May 2025 18:01:07 -0700 Subject: [PATCH 29/52] feat: include bq schema and query string in dry run results (#1752) * feat: include bq schema and query string in dry run results * rename key * fix tests --- bigframes/session/dry_runs.py | 14 ++++++++++++-- tests/system/small/test_dataframe_io.py | 3 ++- tests/system/small/test_index_io.py | 5 ++++- tests/system/small/test_series.py | 3 ++- tests/system/small/test_session.py | 3 +++ 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index caf3dfc2bb..51e8e72c9a 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -37,6 +37,10 @@ def get_table_stats(table: bigquery.Table) -> pandas.Series: index.append("columnDtypes") values.append(col_dtypes) + # Add raw BQ schema + index.append("bigquerySchema") + values.append(table.schema) + for key in ("numBytes", "numRows", "location", "type"): index.append(key) values.append(table._properties[key]) @@ -96,8 +100,12 @@ def get_query_stats( ) -> pandas.Series: """Returns 
important stats from the query job as a Pandas Series.""" - index = [] - values = [] + index: List[Any] = [] + values: List[Any] = [] + + # Add raw BQ schema + index.append("bigquerySchema") + values.append(query_job.schema) job_api_repr = copy.deepcopy(query_job._properties) @@ -110,6 +118,8 @@ def get_query_stats( configuration = job_api_repr.get("configuration", {}) index.append("jobType") values.append(configuration.get("jobType", None)) + index.append("dispatchedSql") + values.append(configuration.get("query", {}).get("query", None)) query_config = configuration.get("query", {}) for key in ("destinationTable", "useLegacySql"): diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 857bec67c0..fac3e9f4b8 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -319,7 +319,8 @@ def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): result = bf_df.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 0 def test_to_arrow_override_global_option(scalars_df_index): diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py index 78e561c2fd..306b15e67a 100644 --- a/tests/system/small/test_index_io.py +++ b/tests/system/small/test_index_io.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd + import bigframes @@ -35,7 +37,8 @@ def test_to_pandas_dry_run(scalars_df_index): result = index.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 0 def test_to_numpy_override_global_option(scalars_df_index): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 286a480d18..b4c24e4ba9 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4604,4 +4604,5 @@ def test_series_to_pandas_dry_run(scalars_df_index): result = bf_series.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 0 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index eeb242e8da..2a58061607 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1899,9 +1899,11 @@ def _assert_query_dry_run_stats_are_valid(result: pd.Series): "columnDtypes", "indexLevel", "indexDtypes", + "bigquerySchema", "projectId", "location", "jobType", + "dispatchedSql", "destinationTable", "useLegacySql", "referencedTables", @@ -1922,6 +1924,7 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): "isQuery", "columnCount", "columnDtypes", + "bigquerySchema", "numBytes", "numRows", "location", From 1f6442e576c35ec784ccf9cab3d081d46e45a5ce Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 22 May 2025 08:54:19 -0700 Subject: [PATCH 30/52] feat: Add deferred data uploading (#1720) --- bigframes/constants.py | 4 + bigframes/core/array_value.py | 11 +- bigframes/core/nodes.py | 20 +++- bigframes/session/__init__.py | 26 +++-- bigframes/session/bq_caching_executor.py | 85 ++++++++++++++- bigframes/session/loader.py | 113 ++++++++++++-------- tests/system/small/test_large_local_data.py | 55 ++++++++++ tests/unit/session/test_session.py | 2 +- third_party/bigframes_vendored/constants.py | 1 + 9 files changed, 253 insertions(+), 64 deletions(-) 
create mode 100644 tests/system/small/test_large_local_data.py diff --git a/bigframes/constants.py b/bigframes/constants.py index 89f27afd78..b6e0b8b221 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -128,4 +128,8 @@ # BigQuery default is 10000, leave 100 for overhead MAX_COLUMNS = 9900 +# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table. +# Also must assume that text encoding as literals is much less efficient than in-memory representation. +MAX_INLINE_BYTES = 5000 + SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows." diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 60f5315554..20773fd1b4 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -133,8 +133,17 @@ def from_table( ordering=ordering, n_rows=n_rows, ) + return cls.from_bq_data_source(source_def, scan_list, session) + + @classmethod + def from_bq_data_source( + cls, + source: nodes.BigqueryDataSource, + scan_list: nodes.ScanList, + session: Session, + ): node = nodes.ReadTableNode( - source=source_def, + source=source, scan_list=scan_list, table_session=session, ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 0fbfe7bd37..3e4bdb57c4 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -578,6 +578,9 @@ class ScanItem(typing.NamedTuple): def with_id(self, id: identifiers.ColumnId) -> ScanItem: return ScanItem(id, self.dtype, self.source_id) + def with_source_id(self, source_id: str) -> ScanItem: + return ScanItem(self.id, self.dtype, source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -614,6 +617,21 @@ def project( result = ScanList((self.items[:1])) return result + def remap_source_ids( + self, + mapping: Mapping[str, str], + ) -> ScanList: + items = tuple( + item.with_source_id(mapping.get(item.source_id, item.source_id)) + for item in self.items + ) + return ScanList(items) + + def append( + self, source_id: str, dtype: bigframes.dtypes.Dtype, id: identifiers.ColumnId + ) -> ScanList: + return ScanList((*self.items, ScanItem(id, dtype, source_id))) + @dataclasses.dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): @@ -621,9 +639,9 @@ class ReadLocalNode(LeafNode): local_data_source: local_data.ManagedArrowTable # Mapping of local ids to bfet id. 
scan_list: ScanList + session: bigframes.session.Session # Offsets are generated only if this is non-null offsets_col: Optional[identifiers.ColumnId] = None - session: typing.Optional[bigframes.session.Session] = None @property def fields(self) -> Sequence[Field]: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7630e71eaa..f86ba6ddc8 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -60,6 +60,7 @@ from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients +import bigframes.constants from bigframes.core import blocks, log_adapter import bigframes.core.pyformat @@ -248,13 +249,6 @@ def __init__( self._temp_storage_manager = ( self._session_resource_manager or self._anon_dataset_manager ) - self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( - bqclient=self._clients_provider.bqclient, - bqstoragereadclient=self._clients_provider.bqstoragereadclient, - storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, - metrics=self._metrics, - ) self._loader = bigframes.session.loader.GbqDataLoader( session=self, bqclient=self._clients_provider.bqclient, @@ -265,6 +259,14 @@ def __init__( force_total_order=self._strictly_ordered, metrics=self._metrics, ) + self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( + bqclient=self._clients_provider.bqclient, + bqstoragereadclient=self._clients_provider.bqstoragereadclient, + loader=self._loader, + storage_manager=self._temp_storage_manager, + strictly_ordered=self._strictly_ordered, + metrics=self._metrics, + ) def __del__(self): """Automatic cleanup of internal resources.""" @@ -937,15 +939,15 @@ def _read_pandas( if write_engine == "default": write_engine = ( "bigquery_load" - if mem_usage > MAX_INLINE_DF_BYTES + if mem_usage > bigframes.constants.MAX_INLINE_BYTES else "bigquery_inline" ) if write_engine == "bigquery_inline": - if mem_usage > MAX_INLINE_DF_BYTES: + if mem_usage > bigframes.constants.MAX_INLINE_BYTES: raise ValueError( f"DataFrame size ({mem_usage} bytes) exceeds the maximum allowed " - f"for inline data ({MAX_INLINE_DF_BYTES} bytes)." + f"for inline data ({bigframes.constants.MAX_INLINE_BYTES} bytes)." 
) return self._read_pandas_inline(pandas_dataframe) elif write_engine == "bigquery_load": @@ -954,6 +956,10 @@ def _read_pandas( return self._loader.read_pandas(pandas_dataframe, method="stream") elif write_engine == "bigquery_write": return self._loader.read_pandas(pandas_dataframe, method="write") + elif write_engine == "_deferred": + import bigframes.dataframe as dataframe + + return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe, self)) else: raise ValueError(f"Got unexpected write_engine '{write_engine}'") diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 118838c059..33d3314a1e 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -17,6 +17,7 @@ import dataclasses import math import os +import threading from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings import weakref @@ -27,8 +28,9 @@ import google.cloud.bigquery.table as bq_table import google.cloud.bigquery_storage_v1 +import bigframes.constants import bigframes.core -from bigframes.core import compile, rewrite +from bigframes.core import compile, local_data, rewrite import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.guid import bigframes.core.nodes as nodes @@ -38,7 +40,7 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.features -from bigframes.session import executor, local_scan_executor, read_api_execution +from bigframes.session import executor, loader, local_scan_executor, read_api_execution import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -67,12 +69,19 @@ def _get_default_output_spec() -> OutputSpec: ) +SourceIdMapping = Mapping[str, str] + + class ExecutionCache: def __init__(self): # current assumption is only 1 cache of a given node # in future, might have multiple caches, with different layout, localities self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.BigFrameNode + nodes.BigFrameNode, nodes.CachedTableNode + ] = weakref.WeakKeyDictionary() + self._uploaded_local_data: weakref.WeakKeyDictionary[ + local_data.ManagedArrowTable, + tuple[nodes.BigqueryDataSource, SourceIdMapping], ] = weakref.WeakKeyDictionary() @property @@ -105,6 +114,19 @@ def cache_results_table( assert original_root.schema == cached_replacement.schema self._cached_executions[original_root] = cached_replacement + def cache_remote_replacement( + self, + local_data: local_data.ManagedArrowTable, + bq_data: nodes.BigqueryDataSource, + ): + # bq table has one extra column for offsets, those are implicit for local data + assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) + mapping = { + local_data.schema.items[i].column: bq_data.table.physical_schema[i].name + for i in range(len(local_data.schema)) + } + self._uploaded_local_data[local_data] = (bq_data, mapping) + class BigQueryCachingExecutor(executor.Executor): """Computes BigFrames values using BigQuery Engine. 
@@ -120,6 +142,7 @@ def __init__( bqclient: bigquery.Client, storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager, bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, + loader: loader.GbqDataLoader, *, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, @@ -129,6 +152,7 @@ def __init__( self.strictly_ordered: bool = strictly_ordered self.cache: ExecutionCache = ExecutionCache() self.metrics = metrics + self.loader = loader self.bqstoragereadclient = bqstoragereadclient # Simple left-to-right precedence for now self._semi_executors = ( @@ -138,6 +162,7 @@ def __init__( ), local_scan_executor.LocalScanExecutor(), ) + self._upload_lock = threading.Lock() def to_sql( self, @@ -149,6 +174,7 @@ def to_sql( if offset_column: array_value, _ = array_value.promote_offsets() node = self.logical_plan(array_value.node) if enable_cache else array_value.node + node = self._substitute_large_local_sources(node) compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered)) return compiled.sql @@ -402,6 +428,7 @@ def _cache_with_cluster_cols( ): """Executes the query and uses the resulting table to rewrite future executions.""" plan = self.logical_plan(array_value.node) + plan = self._substitute_large_local_sources(plan) compiled = compile.compile_sql( compile.CompileRequest( plan, sort_rows=False, materialize_all_order_keys=True @@ -422,7 +449,7 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): w_offsets, offset_column = array_value.promote_offsets() compiled = compile.compile_sql( compile.CompileRequest( - self.logical_plan(w_offsets.node), + self.logical_plan(self._substitute_large_local_sources(w_offsets.node)), sort_rows=False, ) ) @@ -532,6 +559,54 @@ def _validate_result_schema( f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" ) + def _substitute_large_local_sources(self, original_root: nodes.BigFrameNode): + """ + Replace large local sources with the uploaded version of those datasources. + """ + # Step 1: Upload all previously un-uploaded data + for leaf in original_root.unique_nodes(): + if isinstance(leaf, nodes.ReadLocalNode): + if ( + leaf.local_data_source.metadata.total_bytes + > bigframes.constants.MAX_INLINE_BYTES + ): + self._upload_local_data(leaf.local_data_source) + + # Step 2: Replace local scans with remote scans + def map_local_scans(node: nodes.BigFrameNode): + if not isinstance(node, nodes.ReadLocalNode): + return node + if node.local_data_source not in self.cache._uploaded_local_data: + return node + bq_source, source_mapping = self.cache._uploaded_local_data[ + node.local_data_source + ] + scan_list = node.scan_list.remap_source_ids(source_mapping) + # offsets_col isn't part of ReadTableNode, so emulate by adding to end of scan_list + if node.offsets_col is not None: + # Offsets are always implicitly the final column of uploaded data + # See: Loader.load_data + scan_list = scan_list.append( + bq_source.table.physical_schema[-1].name, + bigframes.dtypes.INT_DTYPE, + node.offsets_col, + ) + return nodes.ReadTableNode(bq_source, scan_list, node.session) + + return original_root.bottom_up(map_local_scans) + + def _upload_local_data(self, local_table: local_data.ManagedArrowTable): + if local_table in self.cache._uploaded_local_data: + return + # Lock prevents concurrent repeated work, but slows things down. 
+ # Might be better as a queue and a worker thread + with self._upload_lock: + if local_table not in self.cache._uploaded_local_data: + uploaded = self.loader.load_data( + local_table, bigframes.core.guid.generate_guid() + ) + self.cache.cache_remote_replacement(local_table, uploaded) + def _execute_plan( self, plan: nodes.BigFrameNode, @@ -562,6 +637,8 @@ def _execute_plan( # Use explicit destination to avoid 10GB limit of temporary table if destination_table is not None: job_config.destination = destination_table + + plan = self._substitute_large_local_sources(plan) compiled = compile.compile_sql( compile.CompileRequest(plan, sort_rows=ordered, peek_count=peek) ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 1e32f3d860..ce5d3d66b6 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -44,7 +44,7 @@ import pandas import pyarrow as pa -from bigframes.core import guid, local_data, utils +from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.schema as schemata @@ -183,34 +183,55 @@ def read_pandas( [*idx_cols, *val_cols], axis="columns" ) managed_data = local_data.ManagedArrowTable.from_pandas(prepared_df) + block = blocks.Block( + self.read_managed_data(managed_data, method=method), + index_columns=idx_cols, + column_labels=pandas_dataframe.columns, + index_labels=pandas_dataframe.index.names, + ) + return dataframe.DataFrame(block) + def read_managed_data( + self, + data: local_data.ManagedArrowTable, + method: Literal["load", "stream", "write"], + ) -> core.ArrayValue: + offsets_col = guid.generate_guid("upload_offsets_") if method == "load": - array_value = self.load_data(managed_data) + gbq_source = self.load_data(data, offsets_col=offsets_col) elif method == "stream": - array_value = self.stream_data(managed_data) + gbq_source = self.stream_data(data, offsets_col=offsets_col) elif method == "write": - array_value = self.write_data(managed_data) + gbq_source = self.write_data(data, offsets_col=offsets_col) else: raise ValueError(f"Unsupported read method {method}") - block = blocks.Block( - array_value, - index_columns=idx_cols, - column_labels=pandas_dataframe.columns, - index_labels=pandas_dataframe.index.names, + return core.ArrayValue.from_bq_data_source( + source=gbq_source, + scan_list=nodes.ScanList( + tuple( + nodes.ScanItem( + identifiers.ColumnId(item.column), item.dtype, item.column + ) + for item in data.schema.items + ) + ), + session=self._session, ) - return dataframe.DataFrame(block) - def load_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + def load_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("load_offsets_") # JSON support incomplete for item in data.schema.items: _validate_dtype_can_load(item.column, item.dtype) schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_LOAD_JOB_TYPE_OVERRIDES) @@ -226,13 +247,13 @@ def load_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: job_config.schema = bq_schema load_table_destination = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] + bq_schema, [offsets_col] ) buffer = io.BytesIO() data.to_parquet( buffer, - 
offsets_col=ordering_col, + offsets_col=offsets_col, geo_format="wkt", duration_type="duration", json_type="string", @@ -244,23 +265,24 @@ def load_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: self._start_generic_job(load_job) # must get table metadata after load job for accurate metadata destination_table = self._bqclient.get_table(load_table_destination) - return core.ArrayValue.from_table( - table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) - def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + def stream_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("stream_offsets_") schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) load_table_destination = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] + bq_schema, [offsets_col] ) rows = data.itertuples( @@ -279,24 +301,23 @@ def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}" ) destination_table = self._bqclient.get_table(load_table_destination) - return core.ArrayValue.from_table( - table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) - def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + def write_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("stream_offsets_") schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) - bq_table_ref = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] - ) + bq_table_ref = self._storage_manager.create_temp_table(bq_schema, [offsets_col]) requested_stream = bq_storage_types.stream.WriteStream() requested_stream.type_ = bq_storage_types.stream.WriteStream.Type.COMMITTED # type: ignore @@ -308,7 +329,7 @@ def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]: schema, batches = data.to_arrow( - offsets_col=ordering_col, duration_type="int" + offsets_col=offsets_col, duration_type="int" ) offset = 0 for batch in batches: @@ -334,13 +355,11 @@ def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]: assert response.row_count == data.data.num_rows destination_table = self._bqclient.get_table(bq_table_ref) - return core.ArrayValue.from_table( - 
table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: diff --git a/tests/system/small/test_large_local_data.py b/tests/system/small/test_large_local_data.py new file mode 100644 index 0000000000..eddec37132 --- /dev/null +++ b/tests/system/small/test_large_local_data.py @@ -0,0 +1,55 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +import bigframes +from tests.system.utils import assert_pandas_df_equal + +large_dataframe = pd.DataFrame(np.random.rand(10000, 10), dtype="Float64") +large_dataframe.index = large_dataframe.index.astype("Int64") + + +def test_read_pandas_defer_noop(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + + assert_pandas_df_equal(large_dataframe, bf_df.to_pandas()) + + +def test_read_pandas_defer_cumsum(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_df = bf_df.cumsum() + + assert_pandas_df_equal(large_dataframe.cumsum(), bf_df.to_pandas()) + + +def test_read_pandas_defer_cache_cumsum_cumsum(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_df = bf_df.cumsum().cache().cumsum() + + assert_pandas_df_equal(large_dataframe.cumsum().cumsum(), bf_df.to_pandas()) + + +def test_read_pandas_defer_peek(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_result = bf_df.peek(15) + + assert len(bf_result) == 15 + assert_pandas_df_equal(large_dataframe.loc[bf_result.index], bf_result) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 91b6679702..dc8ee2c0d9 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -484,7 +484,7 @@ def today(cls): mocks.create_bigquery_session() -@mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1) +@mock.patch("bigframes.constants.MAX_INLINE_BYTES", 1) def test_read_pandas_inline_exceeds_limit_raises_error(): session = mocks.create_bigquery_session() pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index af87694cd5..6d55817a27 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -52,5 +52,6 @@ 
"bigquery_load", "bigquery_streaming", "bigquery_write", + "_deferred", ] VALID_WRITE_ENGINES = typing.get_args(WriteEngineType) From 50dca4c706d78673b03f90eccf776118247ba30b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 22 May 2025 11:13:14 -0700 Subject: [PATCH 31/52] feat: support dtype parameter in read_csv for bigquery engine (#1749) Fixes internal issue 404530013 --- bigframes/session/__init__.py | 17 +++++++------ bigframes/session/loader.py | 7 +++--- tests/system/small/test_session.py | 39 ++++++++++++++++++++++++++++++ tests/unit/session/test_session.py | 16 ++++++++---- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f86ba6ddc8..46d71a079e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -61,7 +61,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.clients import bigframes.constants -from bigframes.core import blocks, log_adapter +from bigframes.core import blocks, log_adapter, utils import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -1108,11 +1108,8 @@ def _read_csv_w_bigquery_engine( native CSV loading capabilities, making it suitable for large datasets that may not fit into local memory. """ - if dtype is not None: - raise NotImplementedError( - f"BigQuery engine does not support the `dtype` argument." - f"{constants.FEEDBACK_LINK}" - ) + if dtype is not None and not utils.is_dict_like(dtype): + raise ValueError("dtype should be a dict-like object.") if names is not None: if len(names) != len(set(names)): @@ -1167,10 +1164,16 @@ def _read_csv_w_bigquery_engine( job_config.skip_leading_rows = header + 1 table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config) - return self._loader.read_gbq_table( + df = self._loader.read_gbq_table( table_id, index_col=index_col, columns=columns, names=names ) + if dtype is not None: + for column, dtype in dtype.items(): + if column in df.columns: + df[column] = df[column].astype(dtype) + return df + def read_pickle( self, filepath_or_buffer: FilePath | ReadPickleBuffer, diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ce5d3d66b6..8b0a1266ce 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -663,9 +663,10 @@ def read_gbq_table( renamed_cols: Dict[str, str] = { col: new_name for col, new_name in zip(array_value.column_ids, names) } - index_names = [ - renamed_cols.get(index_col, index_col) for index_col in index_cols - ] + if index_col != bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: + index_names = [ + renamed_cols.get(index_col, index_col) for index_col in index_cols + ] value_columns = [renamed_cols.get(col, col) for col in value_columns] block = blocks.Block( diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2a58061607..dfb69d628e 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1369,6 +1369,45 @@ def test_read_csv_for_names_and_index_col( ) +def test_read_csv_for_dtype(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + dtype = {"bool_col": pd.BooleanDtype(), "int64_col": pd.Float64Dtype()} + bf_df = session.read_csv(path, engine="bigquery", dtype=dtype) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. 
+ pd_df = session.read_csv(path, dtype=dtype) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index("rowindex").sort_index() + pd_df = pd_df.set_index("rowindex") + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_dtype_w_names(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + dtype = {"b": pd.BooleanDtype(), "c": pd.Float64Dtype()} + bf_df = session.read_csv(path, engine="bigquery", names=names, dtype=dtype) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv(path, names=names, dtype=dtype) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index("a").sort_index() + pd_df = pd_df.set_index("a") + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + @pytest.mark.parametrize( ("kwargs", "match"), [ diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index dc8ee2c0d9..cbd31f588a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -108,11 +108,6 @@ @pytest.mark.parametrize( ("kwargs", "match"), [ - pytest.param( - {"engine": "bigquery", "dtype": {}}, - "BigQuery engine does not support the `dtype` argument", - id="with_dtype", - ), pytest.param( {"engine": "bigquery", "usecols": [1, 2]}, "BigQuery engine only supports an iterable of strings for `usecols`.", @@ -215,6 +210,17 @@ def test_read_csv_w_bigquery_engine_raises_error_for_invalid_names( session.read_csv("path/to/csv.csv", engine="bigquery", names=names) +def test_read_csv_w_bigquery_engine_raises_error_for_invalid_dtypes(): + session = mocks.create_bigquery_session() + + with pytest.raises(ValueError, match="dtype should be a dict-like object."): + session.read_csv( + "path/to/csv.csv", + engine="bigquery", + dtype=["a", "b", "c"], # type: ignore[arg-type] + ) + + @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) def test_read_gbq_missing_parts(missing_parts_table_id): session = mocks.create_bigquery_session() From 1cfbb4798d3113532f3151fe3f37fbb097ef7ded Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 23 May 2025 13:58:44 -0700 Subject: [PATCH 32/52] chore: add individual timeout for unit-tests (#1768) --- noxfile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index 5c20487ccd..3f393f01da 100644 --- a/noxfile.py +++ b/noxfile.py @@ -70,9 +70,10 @@ "mock", "asyncmock", PYTEST_VERSION, - "pytest-cov", "pytest-asyncio", + "pytest-cov", "pytest-mock", + "pytest-timeout", ] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] @@ -228,6 +229,10 @@ def run_unit(session, install_test_extra): session.run( "py.test", "--quiet", + # Any individual test taking longer than 1 mins will be terminated. + "--timeout=60", + # Log 20 slowest tests + "--durations=20", f"--junitxml=unit_{session.python}_sponge_log.xml", "--cov=bigframes", f"--cov={tests_path}", @@ -355,7 +360,7 @@ def run_system( # Run py.test against the system tests. 
pytest_cmd = [ "py.test", - "--quiet", + "-v", f"-n={num_workers}", # Any individual test taking longer than 15 mins will be terminated. f"--timeout={timeout_seconds}", From 15f3f2aa42cfe4a2233f62c5f8906e7f7658f9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 27 May 2025 15:55:53 -0500 Subject: [PATCH 33/52] perf: use JOB_CREATION_OPTIONAL when `allow_large_results=False` (#1763) --- bigframes/session/clients.py | 7 +++++++ tests/unit/session/test_clients.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index a8e1ab71f1..86312eb9ba 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -181,6 +181,13 @@ def _create_bigquery_client(self): location=self._location, ) + # If a new enough client library is available, we opt-in to the faster + # backend behavior. This only affects code paths where query_and_wait is + # used, which doesn't expose a query job directly. See internal issue + # b/417985981. + if hasattr(bq_client, "default_job_creation_mode"): + bq_client.default_job_creation_mode = "JOB_CREATION_OPTIONAL" + if self._bq_kms_key_name: # Note: Key configuration only applies automatically to load and query jobs, not copy jobs. encryption_config = bigquery.EncryptionConfiguration( diff --git a/tests/unit/session/test_clients.py b/tests/unit/session/test_clients.py index c9a12be584..5d577a52ed 100644 --- a/tests/unit/session/test_clients.py +++ b/tests/unit/session/test_clients.py @@ -46,6 +46,8 @@ def create_clients_provider(application_name: Optional[str] = None): def monkeypatch_client_constructors(monkeypatch): bqclient = mock.create_autospec(google.cloud.bigquery.Client) bqclient.return_value = bqclient + # Assume we have a new client library in the unit tests. 
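+    # Pre-setting the attribute makes hasattr(bq_client, "default_job_creation_mode")
+    # return True inside _create_bigquery_client, so the JOB_CREATION_OPTIONAL opt-in
+    # path is exercised against the mock.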
+ bqclient.default_job_creation_mode = None # type: ignore monkeypatch.setattr(google.cloud.bigquery, "Client", bqclient) bqconnectionclient = mock.create_autospec( @@ -83,6 +85,11 @@ def monkeypatch_client_constructors(monkeypatch): ) +def assert_bqclient_sets_default_job_creation_mode(provider: clients.ClientsProvider): + bqclient = provider.bqclient + assert bqclient.default_job_creation_mode == "JOB_CREATION_OPTIONAL" + + def assert_constructed_w_user_agent(mock_client: mock.Mock, expected_user_agent: str): assert ( expected_user_agent From 393425e5a4b06c8c3193075d0a59d269115a1635 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 28 May 2025 08:58:23 -0700 Subject: [PATCH 34/52] test: ignore `bigframes/testing` folder from testing (#1767) This is to disable failures like the following: _____________ ERROR collecting bigframes/testing/polars_session.py _____________ bigframes/testing/polars_session.py:19: in import polars E ModuleNotFoundError: No module named 'polars' --- noxfile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/noxfile.py b/noxfile.py index 3f393f01da..2fd437f469 100644 --- a/noxfile.py +++ b/noxfile.py @@ -429,6 +429,8 @@ def doctest(session: nox.sessions.Session): "third_party/bigframes_vendored/ibis", "--ignore", "bigframes/core/compile/polars", + "--ignore", + "bigframes/testing", ), test_folder="bigframes", check_cov=True, From ad5b98c5705abdb5ff5ba0dd2f02937fb22405fa Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 28 May 2025 09:55:11 -0700 Subject: [PATCH 35/52] refactor: compile the selected_cols for the ResultNode (#1765) --- bigframes/core/compile/sqlglot/compiler.py | 9 ++- bigframes/core/compile/sqlglot/sqlglot_ir.py | 74 ++++++++++++++++++- .../test_compile_projection/out.sql | 5 +- .../test_compile_readlocal/out.sql | 31 ++++---- .../test_compile_readlocal_w_json_df/out.sql | 3 +- .../test_compile_readlocal_w_lists_df/out.sql | 17 ++--- .../out.sql | 5 +- 7 files changed, 106 insertions(+), 38 deletions(-) diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 91d1fa0d85..953ebf34fd 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -120,7 +120,14 @@ def _remap_variables(self, node: nodes.ResultNode) -> nodes.ResultNode: def _compile_result_node(self, root: nodes.ResultNode) -> str: sqlglot_ir = self.compile_node(root.child) - # TODO: add order_by, limit, and selections to sqlglot_expr + + selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( + (name, scalar_compiler.compile_scalar_expression(ref)) + for ref, name in root.output_cols + ) + sqlglot_ir = sqlglot_ir.select(selected_cols) + + # TODO: add order_by, limit to sqlglot_expr return sqlglot_ir.sql @functools.lru_cache(maxsize=5000) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index bd1d225d65..23b441591b 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -128,15 +128,22 @@ def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], ) -> SQLGlotIR: - cols_expr = [ + selections = [ sge.Alias( this=expr, alias=sge.to_identifier(id, quoted=self.quoted), ) for id, expr in selected_cols ] - new_expr = self._encapsulate_as_cte().select(*cols_expr, append=False) - return SQLGlotIR(expr=new_expr) + # Attempts to simplify selected columns when the original and new column + # names are simply aliases of each other. 
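+        # For example, an existing projection `A AS X` followed by a new selection
+        # `X AS P` collapses to `A AS P`; see _squash_selections below.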
+ squashed_selections = _squash_selections(self.expr.expressions, selections) + if squashed_selections != []: + new_expr = self.expr.select(*squashed_selections, append=False) + return SQLGlotIR(expr=new_expr) + else: + new_expr = self._encapsulate_as_cte().select(*selections, append=False) + return SQLGlotIR(expr=new_expr) def project( self, @@ -199,7 +206,7 @@ def _encapsulate_as_cte( this=select_expr, alias=new_cte_name, ) - new_with_clause = sge.With(expressions=existing_ctes + [new_cte]) + new_with_clause = sge.With(expressions=[*existing_ctes, new_cte]) new_select_expr = ( sge.Select().select(sge.Star()).from_(sge.Table(this=new_cte_name)) ) @@ -254,3 +261,62 @@ def _table(table: bigquery.TableReference) -> sge.Table: db=sg.to_identifier(table.dataset_id, quoted=True), catalog=sg.to_identifier(table.project, quoted=True), ) + + +def _squash_selections( + old_expr: list[sge.Expression], new_expr: list[sge.Alias] +) -> list[sge.Alias]: + """ + Simplifies the select column expressions if existing (old_expr) and + new (new_expr) selected columns are both simple aliases of column definitions. + + Example: + old_expr: [A AS X, B AS Y] + new_expr: [X AS P, Y AS Q] + Result: [A AS P, B AS Q] + """ + old_alias_map: typing.Dict[str, str] = {} + for selected in old_expr: + column_alias_pair = _get_column_alias_pair(selected) + if column_alias_pair is None: + return [] + else: + old_alias_map[column_alias_pair[1]] = column_alias_pair[0] + + new_selected_cols: typing.List[sge.Alias] = [] + for selected in new_expr: + column_alias_pair = _get_column_alias_pair(selected) + if column_alias_pair is None or column_alias_pair[0] not in old_alias_map: + return [] + else: + new_alias_expr = sge.Alias( + this=sge.ColumnDef( + this=sge.to_identifier( + old_alias_map[column_alias_pair[0]], quoted=True + ) + ), + alias=sg.to_identifier(column_alias_pair[1], quoted=True), + ) + new_selected_cols.append(new_alias_expr) + return new_selected_cols + + +def _get_column_alias_pair( + expr: sge.Expression, +) -> typing.Optional[typing.Tuple[str, str]]: + """Checks if an expression is a simple alias of a column definition + (e.g., "column_name AS alias_name"). + If it is, returns a tuple containing the alias name and original column name. + Returns `None` otherwise. 
+ """ + if not isinstance(expr, sge.Alias): + return None + if not isinstance(expr.this, sge.ColumnDef): + return None + + column_def_expr: sge.ColumnDef = expr.this + if not isinstance(column_def_expr.this, sge.Identifier): + return None + + original_identifier: sge.Identifier = column_def_expr.this + return (original_identifier.this, expr.alias) diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index 3430b6684f..f5182a380b 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -6,7 +6,6 @@ WITH `bfcte_0` AS ( FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0), STRUCT(1, -987654321, 1), STRUCT(2, 314159, 2), STRUCT(3, CAST(NULL AS INT64), 3), STRUCT(4, -234892, 4), STRUCT(5, 55555, 5), STRUCT(6, 101202303, 6), STRUCT(7, -214748367, 7), STRUCT(8, 2, 8)]) ) SELECT - `bfcol_3` AS `bfcol_5`, - `bfcol_4` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7` + `bfcol_3` AS `rowindex`, + `bfcol_4` AS `int64_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index f73ef34051..d7e47b6032 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -157,20 +157,19 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_16`, - `bfcol_1` AS `bfcol_17`, - `bfcol_2` AS `bfcol_18`, - `bfcol_3` AS `bfcol_19`, - `bfcol_4` AS `bfcol_20`, - `bfcol_5` AS `bfcol_21`, - `bfcol_6` AS `bfcol_22`, - `bfcol_7` AS `bfcol_23`, - `bfcol_8` AS `bfcol_24`, - `bfcol_9` AS `bfcol_25`, - `bfcol_10` AS `bfcol_26`, - `bfcol_11` AS `bfcol_27`, - `bfcol_12` AS `bfcol_28`, - `bfcol_13` AS `bfcol_29`, - `bfcol_14` AS `bfcol_30`, - `bfcol_15` AS `bfcol_31` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `bool_col`, + `bfcol_2` AS `bytes_col`, + `bfcol_3` AS `date_col`, + `bfcol_4` AS `datetime_col`, + `bfcol_5` AS `geography_col`, + `bfcol_6` AS `int64_col`, + `bfcol_7` AS `int64_too`, + `bfcol_8` AS `numeric_col`, + `bfcol_9` AS `float64_col`, + `bfcol_10` AS `rowindex_1`, + `bfcol_11` AS `rowindex_2`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `time_col`, + `bfcol_14` AS `timestamp_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index c0e5a0a476..31b46e6c70 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -4,6 +4,5 @@ WITH `bfcte_0` AS ( FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), 
STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) ) SELECT - `bfcol_0` AS `bfcol_2`, - `bfcol_1` AS `bfcol_3` + `bfcol_0` AS `json_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql index c97babdaef..1ba602f205 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -34,13 +34,12 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_9`, - `bfcol_1` AS `bfcol_10`, - `bfcol_2` AS `bfcol_11`, - `bfcol_3` AS `bfcol_12`, - `bfcol_4` AS `bfcol_13`, - `bfcol_5` AS `bfcol_14`, - `bfcol_6` AS `bfcol_15`, - `bfcol_7` AS `bfcol_16`, - `bfcol_8` AS `bfcol_17` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `bool_list_col`, + `bfcol_3` AS `float_list_col`, + `bfcol_4` AS `date_list_col`, + `bfcol_5` AS `date_time_list_col`, + `bfcol_6` AS `numeric_list_col`, + `bfcol_7` AS `string_list_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 509e63e029..54d1a1bb2b 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -20,7 +20,6 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_3`, - `bfcol_1` AS `bfcol_4`, - `bfcol_2` AS `bfcol_5` + `bfcol_0` AS `id`, + `bfcol_1` AS `person` FROM `bfcte_0` \ No newline at end of file From ec81dd2228697d5bf193d86396cf7f3212e0289d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 28 May 2025 11:05:11 -0700 Subject: [PATCH 36/52] feat: add bbq.json_query and warn bbq.json_extract deprecated (#1756) --- bigframes/bigquery/__init__.py | 4 +- bigframes/bigquery/_operations/json.py | 48 ++++++++++++++++++-- bigframes/core/compile/scalar_op_compiler.py | 13 ++++++ bigframes/operations/__init__.py | 2 + bigframes/operations/json_ops.py | 15 ++++++ tests/system/small/bigquery/test_json.py | 31 ++++++++++++- 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 02dd77fdd9..301207bb31 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -37,6 +37,7 @@ json_extract, json_extract_array, json_extract_string_array, + json_query, json_set, json_value, parse_json, @@ -58,10 +59,11 @@ "st_distance", "st_intersection", # json ops - "json_set", "json_extract", "json_extract_array", "json_extract_string_array", + "json_query", + "json_set", "json_value", "parse_json", # search ops diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index b59fe40d99..561fb57348 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -22,9 +22,11 @@ from __future__ import annotations from typing import Any, cast, Optional, Sequence, Tuple, Union +import 
warnings import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.exceptions as bfe import bigframes.operations as ops import bigframes.series as series @@ -87,9 +89,13 @@ def json_extract( input: series.Series, json_path: str, ) -> series.Series: - """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` - value. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. + """Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` or + ``JSON`` value. This function uses single quotes and brackets to escape invalid + JSONPath characters in JSON keys. + + .. deprecated:: 2.5.0 + The ``json_extract`` is deprecated and will be removed in a future version. + Use ``json_query`` instead. **Examples:** @@ -111,6 +117,11 @@ def json_extract( Returns: bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ + msg = ( + "The `json_extract` is deprecated and will be removed in a future version. " + "Use `json_query` instead." + ) + warnings.warn(bfe.format_message(msg), category=UserWarning) return input._apply_unary_op(ops.JSONExtract(json_path=json_path)) @@ -231,6 +242,37 @@ def json_extract_string_array( return array_series +def json_query( + input: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` + or ``JSON`` value. This function uses double quotes to escape invalid JSONPath + characters in JSON keys. For example: ``"a.b"``. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_query(s, json_path="$.class") + 0 {"students":[{"id":5},{"id":12}]} + dtype: string + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return input._apply_unary_op(ops.JSONQuery(json_path=json_path)) + + def json_value( input: series.Series, json_path: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 228c866e1a..7707f16dad 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1356,6 +1356,19 @@ def json_extract_string_array_op_impl( return json_extract_string_array(json_obj=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True) +def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery): + # Define a user-defined function whose returned type is dynamically matching the input. + def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... 
+ + return_type = x.type() + json_query.__annotations__["return"] = return_type + json_query_op = ibis_udf.scalar.builtin(json_query) + return json_query_op(json_or_json_string=x, json_path=op.json_path) + + @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e5da674a8c..3e97ec6f4a 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -106,6 +106,7 @@ JSONExtract, JSONExtractArray, JSONExtractStringArray, + JSONQuery, JSONSet, JSONValue, ParseJSON, @@ -355,6 +356,7 @@ "JSONExtract", "JSONExtractArray", "JSONExtractStringArray", + "JSONQuery", "JSONSet", "JSONValue", "ParseJSON", diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index c9ce633cae..b083035d38 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -134,3 +134,18 @@ def output_type(self, *input_types): + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE + + +@dataclasses.dataclass(frozen=True) +class JSONQuery(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_query" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return input_type diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index b9d21f226a..3d155b5f16 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -99,7 +99,8 @@ def test_json_extract_from_json(): ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract(s, "$.a.b") + with pytest.warns(UserWarning, match="The `json_extract` is deprecated"): + actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @@ -212,6 +213,34 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(s) +def test_json_query_from_json(): + s = bpd.Series( + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=dtypes.JSON_DTYPE, + ) + actual = bbq.json_query(s, "$.a.b") + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_from_string(): + s = bpd.Series( + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query(s, "$.a.b") + expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_w_invalid_series_type(): + s = bpd.Series([1, 2]) + with pytest.raises(TypeError): + bbq.json_query(s, "$.a") + + def test_json_value_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], From 66087350ef99eb1cf3166544f9bfbcb8bbd97178 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 28 May 2025 13:55:45 -0700 Subject: [PATCH 37/52] test: remove deprecated gemini 1.5 tests (#1776) --- 
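Note: only the Gemini 2.0 model IDs remain parametrized after this cleanup. A minimal
sketch of exercising one of the remaining models, assuming the GeminiTextGenerator API
already used in this test file; the prompt DataFrame below is illustrative and not part
of this patch:

    from bigframes.ml import llm
    import bigframes.pandas as bpd

    # One of the model IDs still covered by these tests.
    model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
    # Hypothetical single-column prompt frame; predict() sends each row to the model.
    df = bpd.DataFrame({"prompt": ["What is BigQuery DataFrames?"]})
    predictions = model.predict(df)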
tests/system/small/ml/test_llm.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 8bfdffc140..3d5453099d 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -158,12 +158,6 @@ def test_gemini_text_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -212,13 +206,9 @@ def test_gemini_text_generator_multi_cols_predict_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky(retries=2) @@ -738,8 +728,6 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -768,8 +756,6 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -797,11 +783,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) @pytest.mark.parametrize( "model_name", - ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-2.0-flash-exp", - ), + ("gemini-2.0-flash-exp",), ) def test_gemini_preview_model_warnings(model_name): with pytest.warns(exceptions.PreviewWarning): From e5fe14339b4a40ab4a25657ee0453e4108cf8bba Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 28 May 2025 14:37:47 -0700 Subject: [PATCH 38/52] fix: Fix error with self-merge operations (#1774) --- bigframes/core/blocks.py | 2 +- tests/system/small/test_dataframe.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a3a2ac36f5..35cb7d41ae 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2166,7 +2166,7 @@ def merge( result_columns.append(get_column_left[col_id]) for col_id in other.value_columns: if col_id in right_join_ids: - if other.col_id_to_label[matching_right_id] in matching_join_labels: + if other.col_id_to_label[col_id] in matching_join_labels: pass else: result_columns.append(get_column_right[col_id]) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 596b9b17f1..fa451da35f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1635,6 +1635,29 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): ) +def test_self_merge_self_w_on_args(): + data = { + "A": pd.Series([1, 2, 3], dtype="Int64"), + "B": pd.Series([1, 2, 3], dtype="Int64"), + "C": pd.Series([100, 200, 300], dtype="Int64"), + "D": pd.Series(["alpha", "beta", "gamma"], dtype="string[pyarrow]"), + } + df = pd.DataFrame(data) + + df1 = df[["A", "C"]] + df2 = df[["B", "C", "D"]] + pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") + + bf_df = bpd.DataFrame(data) + + 
bf_df1 = bf_df[["A", "C"]] + bf_df2 = bf_df[["B", "C", "D"]] + bf_result = bf_df1.merge( + bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" + ).to_pandas() + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + @pytest.mark.parametrize( ("decimals",), [ From f9c29c85053d8111a74ce382490daed36f8bb35b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 28 May 2025 16:56:15 -0700 Subject: [PATCH 39/52] feat: support dict param for dataframe.agg() (#1772) * feat: support dict param for dataframe.agg() * fix lint * add more tests * fix lint --- bigframes/dataframe.py | 19 +++++++++++++++++-- tests/system/small/test_dataframe.py | 26 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index cba635062f..6c3ac7537b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2925,9 +2925,23 @@ def nunique(self) -> bigframes.series.Series: return bigframes.series.Series(block) def agg( - self, func: str | typing.Sequence[str] + self, + func: str + | typing.Sequence[str] + | typing.Mapping[blocks.Label, typing.Sequence[str] | str], ) -> DataFrame | bigframes.series.Series: - if utils.is_list_like(func): + if utils.is_dict_like(func): + # Must check dict-like first because dictionaries are list-like + # according to Pandas. + agg_cols = [] + for col_label, agg_func in func.items(): + agg_cols.append(self[col_label].agg(agg_func)) + + from bigframes.core.reshape import api as reshape + + return reshape.concat(agg_cols, axis=1) + + elif utils.is_list_like(func): aggregations = [agg_ops.lookup_agg_func(f) for f in func] for dtype, agg in itertools.product(self.dtypes, aggregations): @@ -2941,6 +2955,7 @@ def agg( aggregations, ) ) + else: return bigframes.series.Series( self._block.aggregate_all_and_stack( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index fa451da35f..c80ced45a5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5652,3 +5652,29 @@ def test_astype_invalid_type_fail(scalars_dfs): with pytest.raises(TypeError, match=r".*Share your usecase with.*"): bf_df.astype(123) + + +def test_agg_with_dict(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": ["min", "count"], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): + bf_df, _ = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "nonexisting_col": ["count"], + } + + with pytest.raises(KeyError): + bf_df.agg(agg_funcs) From 8e71b03e8d0e6648aea77b8648ae4eb4c4954ca2 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 28 May 2025 22:41:20 -0700 Subject: [PATCH 40/52] refactor: Cache dtypes for scalar expressions for SQLGlot compiler (#1759) * feat: include bq schema and query string in dry run results * rename key * fix tests * refactor: cache dtypes for scalar expressions: * fix deref expr type resolution bug * add test * remove dry_run changes from another branch * remove more changes from dry_run PR * rename DeferredDtype to AbsentDtype * removed absentDtype and reuse bind_refs * use a separate resolver for fields * fix lint * move field resolutions to a separate function * update helper function name * update doc and function names * 
bind schema at compile time for SQLGlot compiler * define a separate expression for field reference --- bigframes/core/bigframe_node.py | 23 +-- bigframes/core/compile/sqlglot/compiler.py | 3 +- .../core/compile/sqlglot/scalar_compiler.py | 7 + bigframes/core/expression.py | 163 ++++++++++++++---- bigframes/core/field.py | 37 ++++ bigframes/core/nodes.py | 24 +-- bigframes/core/rewrite/schema_binding.py | 49 ++++++ tests/unit/core/test_expression.py | 96 +++++++++-- 8 files changed, 320 insertions(+), 82 deletions(-) create mode 100644 bigframes/core/field.py create mode 100644 bigframes/core/rewrite/schema_binding.py diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 5509adc0ea..45e3c40701 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -22,7 +22,7 @@ import typing from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple -from bigframes.core import identifiers +from bigframes.core import field, identifiers import bigframes.core.schema as schemata import bigframes.dtypes @@ -34,23 +34,6 @@ T = typing.TypeVar("T") -@dataclasses.dataclass(frozen=True) -class Field: - id: identifiers.ColumnId - dtype: bigframes.dtypes.Dtype - # Best effort, nullable=True if not certain - nullable: bool = True - - def with_nullable(self) -> Field: - return Field(self.id, self.dtype, nullable=True) - - def with_nonnull(self) -> Field: - return Field(self.id, self.dtype, nullable=False) - - def with_id(self, id: identifiers.ColumnId) -> Field: - return Field(id, self.dtype, nullable=self.nullable) - - @dataclasses.dataclass(eq=False, frozen=True) class BigFrameNode: """ @@ -162,7 +145,7 @@ def roots(self) -> typing.Set[BigFrameNode]: # TODO: Store some local data lazily for select, aggregate nodes. @property @abc.abstractmethod - def fields(self) -> Sequence[Field]: + def fields(self) -> Sequence[field.Field]: ... 
@property @@ -292,7 +275,7 @@ def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]: return {field.id: field.dtype for field in self.fields} @functools.cached_property - def field_by_id(self) -> Mapping[identifiers.ColumnId, Field]: + def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]: return {field.id: field for field in self.fields} # Plan algorithms diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 953ebf34fd..1cb270297c 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -26,6 +26,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.core.ordering as bf_ordering +from bigframes.core.rewrite import schema_binding class SQLGlotCompiler: @@ -183,6 +184,6 @@ def compile_projection( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) - node = nodes.bottom_up(node, rewrite.rewrite_timedelta_expressions) + node = nodes.bottom_up(node, schema_binding.bind_schema_to_expressions) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) return node diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 48aa4c7b0b..0f059d482c 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -35,6 +35,13 @@ def compile_deref_expression(expr: expression.DerefOp) -> sge.Expression: return sge.ColumnDef(this=sge.to_identifier(expr.id.sql, quoted=True)) +@compile_scalar_expression.register +def compile_field_ref_expression( + expr: expression.SchemaFieldRefExpression, +) -> sge.Expression: + return sge.ColumnDef(this=sge.to_identifier(expr.field.id.sql, quoted=True)) + + @compile_scalar_expression.register def compile_constant_expression( expr: expression.ScalarConstantExpression, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index afd290827d..238b588fea 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -16,14 +16,16 @@ import abc import dataclasses +import functools import itertools import typing from typing import Generator, Mapping, TypeVar, Union import pandas as pd +from bigframes import dtypes +from bigframes.core import field import bigframes.core.identifiers as ids -import bigframes.dtypes as dtypes import bigframes.operations import bigframes.operations.aggregations as agg_ops @@ -50,7 +52,7 @@ class Aggregation(abc.ABC): @abc.abstractmethod def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: ... 
@@ -72,7 +74,7 @@ class NullaryAggregation(Aggregation): op: agg_ops.NullaryWindowOp = dataclasses.field() def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: return self.op.output_type() @@ -86,13 +88,17 @@ def remap_column_refs( @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): - op: agg_ops.UnaryWindowOp = dataclasses.field() - arg: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() + op: agg_ops.UnaryWindowOp + arg: Union[DerefOp, ScalarConstantExpression] def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: - return self.op.output_type(self.arg.output_type(input_types)) + # TODO(b/419300717) Remove resolutions once defers are cleaned up. + resolved_expr = bind_schema_fields(self.arg, input_fields) + assert resolved_expr.is_resolved + + return self.op.output_type(resolved_expr.output_type) @property def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: @@ -118,10 +124,16 @@ class BinaryAggregation(Aggregation): right: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: + # TODO(b/419300717) Remove resolutions once defers are cleaned up. + left_resolved_expr = bind_schema_fields(self.left, input_fields) + assert left_resolved_expr.is_resolved + right_resolved_expr = bind_schema_fields(self.right, input_fields) + assert right_resolved_expr.is_resolved + return self.op.output_type( - self.left.output_type(input_types), self.right.output_type(input_types) + left_resolved_expr.output_type, left_resolved_expr.output_type ) @property @@ -189,10 +201,17 @@ def remap_column_refs( def is_const(self) -> bool: ... + @property @abc.abstractmethod - def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] - ) -> dtypes.ExpressionType: + def is_resolved(self) -> bool: + """ + Returns true if and only if the expression's output type and nullability is available. + """ + ... + + @property + @abc.abstractmethod + def output_type(self) -> dtypes.ExpressionType: ... 
@abc.abstractmethod @@ -256,9 +275,12 @@ def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: def nullable(self) -> bool: return pd.isna(self.value) # type: ignore - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: + @property + def is_resolved(self) -> bool: + return True + + @property + def output_type(self) -> dtypes.ExpressionType: return self.dtype def bind_variables( @@ -308,9 +330,12 @@ def is_const(self) -> bool: def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: return () - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: + @property + def is_resolved(self): + return False + + @property + def output_type(self) -> dtypes.ExpressionType: raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_refs( @@ -340,7 +365,7 @@ def is_identity(self) -> bool: @dataclasses.dataclass(frozen=True) class DerefOp(Expression): - """A variable expression representing an unbound variable.""" + """An expression that refers to a column by ID.""" id: ids.ColumnId @@ -357,13 +382,13 @@ def nullable(self) -> bool: # Safe default, need to actually bind input schema to determine return True - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: - if self.id in input_types: - return input_types[self.id] - else: - raise ValueError(f"Type of variable {self.id} has not been fixed.") + @property + def is_resolved(self) -> bool: + return False + + @property + def output_type(self) -> dtypes.ExpressionType: + raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_variables( self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False @@ -390,6 +415,55 @@ def is_identity(self) -> bool: return True +@dataclasses.dataclass(frozen=True) +class SchemaFieldRefExpression(Expression): + """An expression representing a schema field. 
This is essentially a DerefOp with input schema bound.""" + + field: field.Field + + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return (self.field.id,) + + @property + def is_const(self) -> bool: + return False + + @property + def nullable(self) -> bool: + return self.field.nullable + + @property + def is_resolved(self) -> bool: + return True + + @property + def output_type(self) -> dtypes.ExpressionType: + return self.field.dtype + + def bind_variables( + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + ) -> Expression: + return self + + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> Expression: + if self.field.id in bindings.keys(): + return bindings[self.field.id] + return self + + @property + def is_bijective(self) -> bool: + return True + + @property + def is_identity(self) -> bool: + return True + + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): """An expression representing a scalar operation applied to 1 or more argument sub-expressions.""" @@ -429,13 +503,18 @@ def nullable(self) -> bool: ) return not null_free - def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] - ) -> dtypes.ExpressionType: - operand_types = tuple( - map(lambda x: x.output_type(input_types=input_types), self.inputs) - ) - return self.op.output_type(*operand_types) + @functools.cached_property + def is_resolved(self) -> bool: + return all(input.is_resolved for input in self.inputs) + + @functools.cached_property + def output_type(self) -> dtypes.ExpressionType: + if not self.is_resolved: + raise ValueError(f"Type of expression {self.op.name} has not been fixed.") + + input_types = [input.output_type for input in self.inputs] + + return self.op.output_type(*input_types) def bind_variables( self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False @@ -475,4 +554,22 @@ def deterministic(self) -> bool: ) +def bind_schema_fields( + expr: Expression, field_by_id: Mapping[ids.ColumnId, field.Field] +) -> Expression: + """ + Updates `DerefOp` expressions by replacing column IDs with actual schema fields(columns). + + We can only deduct an expression's output type and nullability after binding schema fields to + all its deref expressions. + """ + if expr.is_resolved: + return expr + + expr_by_id = { + id: SchemaFieldRefExpression(field) for id, field in field_by_id.items() + } + return expr.bind_refs(expr_by_id) + + RefOrConstant = Union[DerefOp, ScalarConstantExpression] diff --git a/bigframes/core/field.py b/bigframes/core/field.py new file mode 100644 index 0000000000..c5b7dd3555 --- /dev/null +++ b/bigframes/core/field.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dataclasses + +from bigframes import dtypes +from bigframes.core import identifiers + + +@dataclasses.dataclass(frozen=True) +class Field: + id: identifiers.ColumnId + dtype: dtypes.Dtype + # Best effort, nullable=True if not certain + nullable: bool = True + + def with_nullable(self) -> Field: + return Field(self.id, self.dtype, nullable=True) + + def with_nonnull(self) -> Field: + return Field(self.id, self.dtype, nullable=False) + + def with_id(self, id: identifiers.ColumnId) -> Field: + return Field(id, self.dtype, nullable=self.nullable) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 3e4bdb57c4..cc82c844f7 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -34,8 +34,9 @@ import google.cloud.bigquery as bq from bigframes.core import identifiers, local_data, sequences -from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field +from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET import bigframes.core.expression as ex +from bigframes.core.field import Field from bigframes.core.ordering import OrderingExpression, RowOrdering import bigframes.core.slices as slices import bigframes.core.window_spec as window @@ -1190,26 +1191,25 @@ class ProjectionNode(UnaryNode, AdditiveNode): assignments: typing.Tuple[typing.Tuple[ex.Expression, identifiers.ColumnId], ...] def _validate(self): - input_types = self.child._dtype_lookup - for expression, id in self.assignments: + for expression, _ in self.assignments: # throws TypeError if invalid - _ = expression.output_type(input_types) + _ = ex.bind_schema_fields(expression, self.child.field_by_id).output_type # Cannot assign to existing variables - append only! assert all(name not in self.child.schema.names for _, name in self.assignments) @functools.cached_property def added_fields(self) -> Tuple[Field, ...]: - input_types = self.child._dtype_lookup - fields = [] for expr, id in self.assignments: + bound_expr = ex.bind_schema_fields(expr, self.child.field_by_id) field = Field( id, - bigframes.dtypes.dtype_for_etype(expr.output_type(input_types)), - nullable=expr.nullable, + bigframes.dtypes.dtype_for_etype(bound_expr.output_type), + nullable=bound_expr.nullable, ) + # Special case until we get better nullability inference in expression objects themselves - if expr.is_identity and not any( + if bound_expr.is_identity and not any( self.child.field_by_id[id].nullable for id in expr.column_references ): field = field.with_nonnull() @@ -1300,7 +1300,7 @@ def fields(self) -> Sequence[Field]: Field( id, bigframes.dtypes.dtype_for_etype( - agg.output_type(self.child._dtype_lookup) + agg.output_type(self.child.field_by_id) ), nullable=True, ) @@ -1410,11 +1410,11 @@ def row_count(self) -> Optional[int]: @functools.cached_property def added_field(self) -> Field: - input_types = self.child._dtype_lookup + input_fields = self.child.field_by_id # TODO: Determine if output could be non-null return Field( self.output_name, - bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_types)), + bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_fields)), ) @property diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py new file mode 100644 index 0000000000..f3c313233b --- /dev/null +++ b/bigframes/core/rewrite/schema_binding.py @@ -0,0 +1,49 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses + +from bigframes.core import bigframe_node +from bigframes.core import expression as ex +from bigframes.core import nodes + + +def bind_schema_to_expressions( + node: bigframe_node.BigFrameNode, +) -> bigframe_node.BigFrameNode: + if isinstance(node, nodes.ProjectionNode): + bound_assignments = tuple( + (ex.bind_schema_fields(expr, node.child.field_by_id), id) + for expr, id in node.assignments + ) + return dataclasses.replace(node, assignments=bound_assignments) + + if isinstance(node, nodes.FilterNode): + bound_predicate = ex.bind_schema_fields(node.predicate, node.child.field_by_id) + return dataclasses.replace(node, predicate=bound_predicate) + + if isinstance(node, nodes.OrderByNode): + bound_bys = [] + for by in node.by: + bound_by = dataclasses.replace( + by, + scalar_expression=ex.bind_schema_fields( + by.scalar_expression, node.child.field_by_id + ), + ) + bound_bys.append(bound_by) + + return dataclasses.replace(node, by=tuple(bound_bys)) + + return node diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py index ab6402a909..9534c8605a 100644 --- a/tests/unit/core/test_expression.py +++ b/tests/unit/core/test_expression.py @@ -12,43 +12,107 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing + +import pytest + +from bigframes.core import field import bigframes.core.expression as ex import bigframes.core.identifiers as ids import bigframes.dtypes as dtypes import bigframes.operations as ops -def test_expression_dtype_simple(): +def test_simple_expression_dtype(): expression = ops.add_op.as_expr("a", "b") - result = expression.output_type( - {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + field_bindings = _create_field_bindings( + {"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE} ) - assert result == dtypes.INT_DTYPE + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.INT_DTYPE) -def test_expression_dtype_nested(): +def test_nested_expression_dtype(): expression = ops.add_op.as_expr( "a", ops.abs_op.as_expr(ops.sub_op.as_expr("b", ex.const(3.14))) ) - - result = expression.output_type( - {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + field_bindings = _create_field_bindings( + {"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE} ) - assert result == dtypes.FLOAT_DTYPE + result = ex.bind_schema_fields(expression, field_bindings) + _assert_output_type(result, dtypes.FLOAT_DTYPE) -def test_expression_dtype_where(): - expression = ops.where_op.as_expr(ex.const(3), ex.const(True), ex.const(None)) - result = expression.output_type({}) +def test_where_op_dtype(): + expression = ops.where_op.as_expr(ex.const(3), ex.const(True), ex.const(None)) - assert result == dtypes.INT_DTYPE + _assert_output_type(expression, dtypes.INT_DTYPE) -def test_expression_dtype_astype(): +def test_astype_op_dtype(): expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159)) - result = expression.output_type({}) + _assert_output_type(expression, 
dtypes.INT_DTYPE) + + +def test_deref_op_dtype_unavailable(): + expression = ex.deref("mycol") + + assert not expression.is_resolved + with pytest.raises(ValueError): + expression.output_type + + +def test_deref_op_dtype_resolution(): + expression = ex.deref("mycol") + field_bindings = _create_field_bindings({"mycol": dtypes.STRING_DTYPE}) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.STRING_DTYPE) + + +def test_field_ref_expr_dtype_resolution_short_circuit(): + expression = ex.SchemaFieldRefExpression( + field.Field(ids.ColumnId("mycol"), dtype=dtypes.INT_DTYPE) + ) + field_bindings = _create_field_bindings({"anotherCol": dtypes.STRING_DTYPE}) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.INT_DTYPE) + + +def test_nested_expression_dtypes_are_cached(): + expression = ops.add_op.as_expr(ex.deref("left_col"), ex.deref("right_col")) + field_bindings = _create_field_bindings( + { + "right_col": dtypes.INT_DTYPE, + "left_col": dtypes.FLOAT_DTYPE, + } + ) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.FLOAT_DTYPE) + assert isinstance(result, ex.OpExpression) + _assert_output_type(result.inputs[0], dtypes.FLOAT_DTYPE) + _assert_output_type(result.inputs[1], dtypes.INT_DTYPE) + + +def _create_field_bindings( + col_dtypes: typing.Dict[str, dtypes.Dtype] +) -> typing.Dict[ids.ColumnId, field.Field]: + return { + ids.ColumnId(col): field.Field(ids.ColumnId(col), dtype) + for col, dtype in col_dtypes.items() + } + - assert result == dtypes.INT_DTYPE +def _assert_output_type(expr: ex.Expression, dtype: dtypes.Dtype): + assert expr.is_resolved + assert expr.output_type == dtype From 0629cac7f9a9370a72c1ae25e014eb478a4c8c08 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 29 May 2025 10:54:47 -0700 Subject: [PATCH 41/52] fix: fix the default value for na_value for numpy conversions (#1766) --- bigframes/dataframe.py | 3 ++- bigframes/series.py | 3 ++- tests/system/small/test_dataframe_io.py | 11 +++++++++++ tests/system/small/test_series.py | 2 +- tests/system/small/test_series_io.py | 11 +++++++++++ third_party/bigframes_vendored/pandas/core/frame.py | 3 ++- third_party/bigframes_vendored/pandas/core/series.py | 8 +++++--- 7 files changed, 34 insertions(+), 7 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6c3ac7537b..be940a1e82 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -47,6 +47,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +from pandas.api import extensions as pd_ext import pandas.io.formats.format import pyarrow import tabulate @@ -4097,7 +4098,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, diff --git a/bigframes/series.py b/bigframes/series.py index 626cf2fc76..866f4d0a5d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -42,6 +42,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +from pandas.api import extensions as pd_ext import pandas.core.dtypes.common import pyarrow as pa import typing_extensions @@ -2109,7 +2110,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fac3e9f4b8..5df7283e3c 100644 --- a/tests/system/small/test_dataframe_io.py 
+++ b/tests/system/small/test_dataframe_io.py @@ -15,6 +15,8 @@ from typing import Tuple import google.api_core.exceptions +import numpy +import numpy.testing import pandas as pd import pandas.testing import pyarrow as pa @@ -1061,3 +1063,12 @@ def test_to_sql_query_named_index_excluded( utils.assert_pandas_df_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True ) + + +def test_to_numpy(scalars_dfs): + bf_df, pd_df = scalars_dfs + + bf_result = numpy.array(bf_df[["int64_too"]], dtype="int64") + pd_result = numpy.array(pd_df[["int64_too"]], dtype="int64") + + numpy.testing.assert_array_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index b4c24e4ba9..710e1481be 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2132,7 +2132,7 @@ def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_na ], ) def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): - bf_uniq = scalars_df_index[col_name].unique().to_numpy() + bf_uniq = scalars_df_index[col_name].unique().to_numpy(na_value=None) pd_uniq = scalars_pandas_df_index[col_name].unique() numpy.array_equal(pd_uniq, bf_uniq) diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py index 5390d65268..426679d37d 100644 --- a/tests/system/small/test_series_io.py +++ b/tests/system/small/test_series_io.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy +import numpy.testing import pandas as pd import pytest @@ -114,3 +116,12 @@ def test_to_pandas_batches(scalars_dfs, page_size, max_results, allow_large_resu total_rows += actual_rows assert total_rows == expected_total_rows + + +def test_to_numpy(scalars_dfs): + bf_df, pd_df = scalars_dfs + + bf_result = numpy.array(bf_df["int64_too"], dtype="int64") + pd_result = numpy.array(pd_df["int64_too"], dtype="int64") + + numpy.testing.assert_array_equal(bf_result, pd_result) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 9bb25cb5a4..63142e4dd8 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -17,6 +17,7 @@ import bigframes_vendored.pandas.core.generic as generic import numpy as np import pandas as pd +from pandas.api import extensions as pd_ext # ----------------------------------------------------------------------- # DataFrame class @@ -369,7 +370,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8164fa7415..673a6f362f 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -19,8 +19,8 @@ from bigframes_vendored.pandas.core.generic import NDFrame import numpy import numpy as np -from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer +from pandas.api import extensions as pd_ext from bigframes import constants @@ -323,7 +323,7 @@ def reset_index( self, *, drop: bool = False, - name=lib.no_default, + name=pd_ext.no_default, ) -> DataFrame | Series | None: """ Generate a new DataFrame or Series 
with the index reset. @@ -730,7 +730,9 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: to_list = tolist - def to_numpy(self, dtype, copy=False, na_value=None, *, allow_large_results=None): + def to_numpy( + self, dtype, copy=False, na_value=pd_ext.no_default, *, allow_large_results=None + ): """ A NumPy ndarray representing the values in this Series or Index. From cbb5f14d4fd637d0ce734f54eceb5280dbb9101f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 29 May 2025 12:47:05 -0700 Subject: [PATCH 42/52] refactor: simplify the plugin detection code, add test coverage (#1769) We added instrumentation of the GCP vscode extension and jupyter plugin in pandas-gbq and bigquery-magics. In this change we are propagating the improvements there to BigFrames. --- bigframes/session/environment.py | 55 ++++++++++++------------- tests/unit/session/test_clients.py | 66 ++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 28 deletions(-) diff --git a/bigframes/session/environment.py b/bigframes/session/environment.py index 3ed6ab98cd..940f8deed4 100644 --- a/bigframes/session/environment.py +++ b/bigframes/session/environment.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. + import importlib import json import os +import pathlib + +Path = pathlib.Path + # The identifier for GCP VS Code extension # https://cloud.google.com/code/docs/vscode/install @@ -29,40 +34,36 @@ def _is_vscode_extension_installed(extension_id: str) -> bool: """ Checks if a given Visual Studio Code extension is installed. - Args: extension_id: The ID of the extension (e.g., "ms-python.python"). - Returns: True if the extension is installed, False otherwise. """ try: # Determine the user's VS Code extensions directory. - user_home = os.path.expanduser("~") - if os.name == "nt": # Windows - vscode_extensions_dir = os.path.join(user_home, ".vscode", "extensions") - elif os.name == "posix": # macOS and Linux - vscode_extensions_dir = os.path.join(user_home, ".vscode", "extensions") - else: - raise OSError("Unsupported operating system.") + user_home = Path.home() + vscode_extensions_dir = user_home / ".vscode" / "extensions" # Check if the extensions directory exists. - if os.path.exists(vscode_extensions_dir): - # Iterate through the subdirectories in the extensions directory. - for item in os.listdir(vscode_extensions_dir): - item_path = os.path.join(vscode_extensions_dir, item) - if os.path.isdir(item_path) and item.startswith(extension_id + "-"): - # Check if the folder starts with the extension ID. - # Further check for manifest file, as a more robust check. - manifest_path = os.path.join(item_path, "package.json") - if os.path.exists(manifest_path): - try: - with open(manifest_path, "r", encoding="utf-8") as f: - json.load(f) - return True - except (FileNotFoundError, json.JSONDecodeError): - # Corrupted or incomplete extension, or manifest missing. - pass + if not vscode_extensions_dir.exists(): + return False + + # Iterate through the subdirectories in the extensions directory. + extension_dirs = filter( + lambda p: p.is_dir() and p.name.startswith(extension_id + "-"), + vscode_extensions_dir.iterdir(), + ) + for extension_dir in extension_dirs: + # As a more robust check, the manifest file must exist. 
+ manifest_path = extension_dir / "package.json" + if not manifest_path.exists() or not manifest_path.is_file(): + continue + + # Finally, the manifest file must be a valid json + with open(manifest_path, "r", encoding="utf-8") as f: + json.load(f) + + return True except Exception: pass @@ -72,10 +73,8 @@ def _is_vscode_extension_installed(extension_id: str) -> bool: def _is_package_installed(package_name: str) -> bool: """ Checks if a Python package is installed. - Args: package_name: The name of the package to check (e.g., "requests", "numpy"). - Returns: True if the package is installed, False otherwise. """ diff --git a/tests/unit/session/test_clients.py b/tests/unit/session/test_clients.py index 5d577a52ed..6b0d8583a5 100644 --- a/tests/unit/session/test_clients.py +++ b/tests/unit/session/test_clients.py @@ -13,6 +13,8 @@ # limitations under the License. import os +import pathlib +import tempfile from typing import Optional import unittest.mock as mock @@ -155,6 +157,7 @@ def test_user_agent_not_in_vscode(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_wo_user_agent(provider, "vscode") + assert_clients_wo_user_agent(provider, "googlecloudtools.cloudcode") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") @@ -165,16 +168,48 @@ def test_user_agent_in_vscode(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_w_user_agent(provider, "vscode") + assert_clients_wo_user_agent(provider, "googlecloudtools.cloudcode") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") +@mock.patch.dict(os.environ, {"VSCODE_PID": "12345"}, clear=True) +def test_user_agent_in_vscode_w_extension(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + + with tempfile.TemporaryDirectory() as tmpdir: + user_home = pathlib.Path(tmpdir) + extension_dir = ( + user_home / ".vscode" / "extensions" / "googlecloudtools.cloudcode-0.12" + ) + extension_config = extension_dir / "package.json" + + # originally extension config does not exist + assert not extension_config.exists() + + # simulate extension installation by creating extension config on disk + extension_dir.mkdir(parents=True) + with open(extension_config, "w") as f: + f.write("{}") + + with mock.patch("pathlib.Path.home", return_value=user_home): + provider = create_clients_provider() + assert_clients_w_user_agent(provider, "vscode") + assert_clients_w_user_agent(provider, "googlecloudtools.cloudcode") + + # We still need to include attribution to bigframes + assert_clients_w_user_agent( + provider, f"bigframes/{bigframes.version.__version__}" + ) + + @mock.patch.dict(os.environ, {}, clear=True) def test_user_agent_not_in_jupyter(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_wo_user_agent(provider, "jupyter") + assert_clients_wo_user_agent(provider, "bigquery_jupyter_plugin") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") @@ -185,6 +220,37 @@ def test_user_agent_in_jupyter(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_w_user_agent(provider, "jupyter") + assert_clients_wo_user_agent(provider, "bigquery_jupyter_plugin") # We still need to include 
attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") + + +@mock.patch.dict(os.environ, {"JPY_PARENT_PID": "12345"}, clear=True) +def test_user_agent_in_jupyter_with_plugin(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + + def custom_import_module_side_effect(name, package=None): + if name == "bigquery_jupyter_plugin": + return mock.MagicMock() + else: + import importlib + + return importlib.import_module(name, package) + + assert isinstance( + custom_import_module_side_effect("bigquery_jupyter_plugin"), mock.MagicMock + ) + assert custom_import_module_side_effect("bigframes") is bigframes + + with mock.patch( + "importlib.import_module", side_effect=custom_import_module_side_effect + ): + provider = create_clients_provider() + assert_clients_w_user_agent(provider, "jupyter") + assert_clients_w_user_agent(provider, "bigquery_jupyter_plugin") + + # We still need to include attribution to bigframes + assert_clients_w_user_agent( + provider, f"bigframes/{bigframes.version.__version__}" + ) From d6b7ab418b2afe7a165841164f60362757825e77 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 29 May 2025 16:27:40 -0700 Subject: [PATCH 43/52] chore: Multimodal tests to target at BQ prod (#1782) --- tests/system/conftest.py | 14 ++++++-------- tests/system/large/blob/test_function.py | 18 +++++++++--------- tests/system/small/blob/test_io.py | 14 +++++++------- tests/system/small/ml/test_multimodal_llm.py | 12 ++++++------ tests/system/small/test_session.py | 10 ---------- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index fb7d8d4e32..824e774dbe 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -920,8 +920,8 @@ def llm_text_pandas_df(): @pytest.fixture(scope="session") -def llm_text_df(test_session, llm_text_pandas_df): - return test_session.read_pandas(llm_text_pandas_df) +def llm_text_df(session, llm_text_pandas_df): + return session.read_pandas(llm_text_pandas_df) @pytest.fixture(scope="session") @@ -1494,9 +1494,9 @@ def images_uris() -> list[str]: @pytest.fixture(scope="session") def images_mm_df( - images_uris, test_session: bigframes.Session, bq_connection: str + images_uris, session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: - blob_series = bpd.Series(images_uris, session=test_session).str.to_blob( + blob_series = bpd.Series(images_uris, session=session).str.to_blob( connection=bq_connection ) return blob_series.rename("blob_col").to_frame() @@ -1518,8 +1518,6 @@ def pdf_gcs_path() -> str: @pytest.fixture(scope="session") def pdf_mm_df( - pdf_gcs_path, test_session: bigframes.Session, bq_connection: str + pdf_gcs_path, session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: - return test_session.from_glob_path( - pdf_gcs_path, name="pdf", connection=bq_connection - ) + return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index c189d249a7..3ebded3d29 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -53,9 +53,9 @@ def images_output_uris(images_output_folder: str) -> list[str]: def test_blob_exif( bq_connection: str, - test_session: bigframes.Session, + session: bigframes.Session, ): - exif_image_df = test_session.from_glob_path( + exif_image_df = session.from_glob_path( 
"gs://bigframes_blob_test/images_exif/*", name="blob_col", connection=bq_connection, @@ -64,7 +64,7 @@ def test_blob_exif( actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection) expected = bpd.Series( ['{"ExifOffset": 47, "Make": "MyCamera"}'], - session=test_session, + session=session, dtype=dtypes.JSON_DTYPE, ) pd.testing.assert_series_equal( @@ -79,9 +79,9 @@ def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) @@ -147,9 +147,9 @@ def test_blob_image_resize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) @@ -217,9 +217,9 @@ def test_blob_image_normalize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 806dad71dc..d3b4c4faa0 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -19,9 +19,9 @@ def test_blob_create_from_uri_str( - bq_connection: str, test_session: bigframes.Session, images_uris + bq_connection: str, session: bigframes.Session, images_uris ): - uri_series = bpd.Series(images_uris, session=test_session) + uri_series = bpd.Series(images_uris, session=session) blob_series = uri_series.str.to_blob(connection=bq_connection) pd_blob_df = blob_series.struct.explode().to_pandas() @@ -40,9 +40,9 @@ def test_blob_create_from_uri_str( def test_blob_create_from_glob_path( - bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): - blob_df = test_session.from_glob_path( + blob_df = session.from_glob_path( images_gcs_path, connection=bq_connection, name="blob_col" ) pd_blob_df = ( @@ -68,11 +68,11 @@ def test_blob_create_from_glob_path( def test_blob_create_read_gbq_object_table( - bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): - obj_table = test_session._create_object_table(images_gcs_path, bq_connection) + obj_table = session._create_object_table(images_gcs_path, bq_connection) - blob_df = test_session.read_gbq_object_table(obj_table, name="blob_col") + blob_df = session.read_gbq_object_table(obj_table, name="blob_col") pd_blob_df = ( blob_df["blob_col"] .struct.explode() diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index ba834906b2..beee95636f 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -23,10 +23,10 @@ @pytest.mark.flaky(retries=2) def test_multimodal_embedding_generator_predict_default_params_success( - images_mm_df, test_session, 
bq_connection + images_mm_df, session, bq_connection ): text_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=test_session + connection_name=bq_connection, session=session ) df = text_embedding_model.predict(images_mm_df).to_pandas() utils.check_pandas_df_schema_and_index( @@ -48,10 +48,10 @@ def test_multimodal_embedding_generator_predict_default_params_success( ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_multimodal_input( - images_mm_df: bpd.DataFrame, model_name, test_session, bq_connection + images_mm_df: bpd.DataFrame, model_name, session, bq_connection ): gemini_text_generator_model = llm.GeminiTextGenerator( - model_name=model_name, connection_name=bq_connection, session=test_session + model_name=model_name, connection_name=bq_connection, session=session ) pd_df = gemini_text_generator_model.predict( images_mm_df, prompt=["Describe", images_mm_df["blob_col"]] @@ -73,10 +73,10 @@ def test_gemini_text_generator_multimodal_input( ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_multimodal_structured_output( - images_mm_df: bpd.DataFrame, model_name, test_session, bq_connection + images_mm_df: bpd.DataFrame, model_name, session, bq_connection ): gemini_text_generator_model = llm.GeminiTextGenerator( - model_name=model_name, connection_name=bq_connection, session=test_session + model_name=model_name, connection_name=bq_connection, session=session ) output_schema = { "bool_output": "bool", diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index dfb69d628e..6e68a759b4 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1814,16 +1814,6 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) -def test_read_gbq_test(test_session: bigframes.Session): - test_project_id = "bigframes-dev" - test_dataset_id = "test_env_only" - test_table_id = "one_table" - table_id = f"{test_project_id}.{test_dataset_id}.{test_table_id}" - actual = test_session.read_gbq(table_id).to_pandas() - - assert actual.shape == (1, 1) - - @pytest.mark.parametrize( ("query_or_table", "index_col", "columns"), [ From 2bc4fbc78eba4bb2ee335e0475700a7ca5bc84d7 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 29 May 2025 17:56:44 -0700 Subject: [PATCH 44/52] perf: Optimize repr for unordered gbq table (#1778) --- bigframes/core/rewrite/__init__.py | 3 +- bigframes/core/rewrite/slices.py | 8 +-- bigframes/core/tree_properties.py | 25 +++------ bigframes/session/loader.py | 5 ++ bigframes/session/read_api_execution.py | 24 +++++++-- .../small/session/test_read_gbq_colab.py | 52 +++++++++++++++++++ 6 files changed, 88 insertions(+), 29 deletions(-) diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 069ebb9cdf..b8f1d26db8 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -22,7 +22,7 @@ try_reduce_to_local_scan, try_reduce_to_table_scan, ) -from bigframes.core.rewrite.slices import pull_up_limits, rewrite_slice +from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions from bigframes.core.rewrite.windows import rewrite_range_rolling @@ -32,6 +32,7 @@ "rewrite_slice", "rewrite_timedelta_expressions", "pull_up_limits", + "pull_out_limit", "remap_variables", "defer_order", "column_pruning", diff --git 
a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index b8a003e061..92911310da 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -26,7 +26,7 @@ def pull_up_limits(root: nodes.ResultNode) -> nodes.ResultNode: - new_child, pulled_limit = _pullup_slice_inner(root.child) + new_child, pulled_limit = pull_out_limit(root.child) if new_child == root.child: return root elif pulled_limit is None: @@ -37,7 +37,7 @@ def pull_up_limits(root: nodes.ResultNode) -> nodes.ResultNode: return dataclasses.replace(root, child=new_child, limit=new_limit) -def _pullup_slice_inner( +def pull_out_limit( root: nodes.BigFrameNode, ) -> Tuple[nodes.BigFrameNode, Optional[int]]: """ @@ -53,7 +53,7 @@ def _pullup_slice_inner( assert root.step == 1 assert root.stop is not None limit = root.stop - new_root, prior_limit = _pullup_slice_inner(root.child) + new_root, prior_limit = pull_out_limit(root.child) if (prior_limit is not None) and (prior_limit < limit): limit = prior_limit return new_root, limit @@ -61,7 +61,7 @@ def _pullup_slice_inner( isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode)) and root.row_preserving ): - new_child, prior_limit = _pullup_slice_inner(root.child) + new_child, prior_limit = pull_out_limit(root.child) if prior_limit is not None: return root.transform_children(lambda _: new_child), prior_limit # Most ops don't support pulling up slice, like filter, agg, join, etc. diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index 82df53af82..baf4b12566 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -45,26 +45,13 @@ def can_fast_head(node: nodes.BigFrameNode) -> bool: # To do fast head operation: # (1) the underlying data must be arranged/indexed according to the logical ordering # (2) transformations must support pushing down LIMIT or a filter on row numbers - return has_fast_offset_address(node) or has_fast_offset_address(node) - - -def has_fast_orderby_limit(node: nodes.BigFrameNode) -> bool: - """True iff ORDER BY LIMIT can be performed without a large full table scan.""" - # TODO: In theory compatible with some Slice nodes, potentially by adding OFFSET - if isinstance(node, nodes.LeafNode): - return node.fast_ordered_limit - if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): - return has_fast_orderby_limit(node.child) - return False - - -def has_fast_offset_address(node: nodes.BigFrameNode) -> bool: - """True iff specific offsets can be scanned without a large full table scan.""" - # TODO: In theory can push offset lookups through slice operators by translating indices - if isinstance(node, nodes.LeafNode): - return node.fast_offsets + if isinstance(node, nodes.ReadLocalNode): + # always cheap to push slice into local data + return True + if isinstance(node, nodes.ReadTableNode): + return (node.source.ordering is None) or (node.fast_ordered_limit) if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): - return has_fast_offset_address(node.child) + return can_fast_head(node.child) return False diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 8b0a1266ce..ba669a62bb 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -387,6 +387,7 @@ def read_gbq_table( # type: ignore[overload-overlap] enable_snapshot: bool = ..., dry_run: Literal[False] = ..., force_total_order: Optional[bool] = ..., + n_rows: Optional[int] = None, ) -> dataframe.DataFrame: ... 
@@ -408,6 +409,7 @@ def read_gbq_table( enable_snapshot: bool = ..., dry_run: Literal[True] = ..., force_total_order: Optional[bool] = ..., + n_rows: Optional[int] = None, ) -> pandas.Series: ... @@ -428,6 +430,7 @@ def read_gbq_table( enable_snapshot: bool = True, dry_run: bool = False, force_total_order: Optional[bool] = None, + n_rows: Optional[int] = None, ) -> dataframe.DataFrame | pandas.Series: import bigframes._tools.strings import bigframes.dataframe as dataframe @@ -618,6 +621,7 @@ def read_gbq_table( at_time=time_travel_timestamp if enable_snapshot else None, primary_key=primary_key, session=self._session, + n_rows=n_rows, ) # if we don't have a unique index, we order by row hash if we are in strict mode if ( @@ -852,6 +856,7 @@ def read_gbq_query( columns=columns, use_cache=configuration["query"]["useQueryCache"], force_total_order=force_total_order, + n_rows=query_job.result().total_rows, # max_results and filters are omitted because they are already # handled by to_query(), above. ) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 9384a40fbe..d4bbf2783c 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -39,12 +39,17 @@ def execute( ordered: bool, peek: Optional[int] = None, ) -> Optional[executor.ExecuteResult]: - node = self._try_adapt_plan(plan, ordered) - if not node: + adapt_result = self._try_adapt_plan(plan, ordered) + if not adapt_result: return None + node, limit = adapt_result if node.explicitly_ordered and ordered: return None + if limit is not None: + if peek is None or limit < peek: + peek = limit + import google.cloud.bigquery_storage_v1.types as bq_storage_types from google.protobuf import timestamp_pb2 @@ -117,11 +122,20 @@ def _try_adapt_plan( self, plan: bigframe_node.BigFrameNode, ordered: bool, - ) -> Optional[nodes.ReadTableNode]: + ) -> Optional[tuple[nodes.ReadTableNode, Optional[int]]]: """ - Tries to simplify the plan to an equivalent single ReadTableNode. Otherwise, returns None. + Tries to simplify the plan to an equivalent single ReadTableNode and a limit. Otherwise, returns None. 
""" + plan, limit = rewrite.pull_out_limit(plan) + # bake_order does not allow slice ops + plan = plan.bottom_up(rewrite.rewrite_slice) if not ordered: # gets rid of order_by ops plan = rewrite.bake_order(plan) - return rewrite.try_reduce_to_table_scan(plan) + read_table_node = rewrite.try_reduce_to_table_scan(plan) + if read_table_node is None: + return None + if (limit is not None) and (read_table_node.source.ordering is not None): + # read api can only use physical ordering to limit, not a logical ordering + return None + return (read_table_node, limit) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 946faffab2..a821901e4c 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -19,6 +19,7 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count df = maybe_ordered_session._read_gbq_colab( """ SELECT @@ -32,9 +33,11 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi LIMIT 300 """ ) + executions_before_python = maybe_ordered_session._metrics.execution_count batches = df.to_pandas_batches( page_size=100, ) + executions_after = maybe_ordered_session._metrics.execution_count total_rows = 0 for batch in batches: @@ -42,6 +45,55 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi total_rows += len(batch.index) assert total_rows > 0 + assert executions_after == executions_before_python == executions_before_sql + 1 + + +def test_read_gbq_colab_peek_avoids_requery(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count + df = maybe_ordered_session._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + executions_before_python = maybe_ordered_session._metrics.execution_count + result = df.peek(100) + executions_after = maybe_ordered_session._metrics.execution_count + + # Ok, this isn't guaranteed by peek, but should happen with read api based impl + # if starts failing, maybe stopped using read api? 
+ assert result["total"].is_monotonic_decreasing + + assert len(result) == 100 + assert executions_after == executions_before_python == executions_before_sql + 1 + + +def test_read_gbq_colab_repr_avoids_requery(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count + df = maybe_ordered_session._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + executions_before_python = maybe_ordered_session._metrics.execution_count + _ = repr(df) + executions_after = maybe_ordered_session._metrics.execution_count + assert executions_after == executions_before_python == executions_before_sql + 1 def test_read_gbq_colab_includes_formatted_scalars(session): From bd07e05d26820313c052eaf41c267a1ab20b4fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 14:39:13 -0500 Subject: [PATCH 45/52] docs: fix typo for "population" in the `GeminiTextGenerator.predict(..., output_schema={...})` sample notebook (#1748) * docs: fix type in output_schema sample notebook * Update bq_dataframes_llm_output_schema.ipynb --- .../bq_dataframes_llm_output_schema.ipynb | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb index 0efac1eee3..04ea0571df 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -361,7 +361,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also get float or int values, for example, to get polulations in millions:" + "You can also get float or int values, for example, to get populations in millions:" ] }, { @@ -400,7 +400,7 @@ " \n", " \n", " city\n", - " polulation_million\n", + " population_in_millions\n", " \n", " \n", " \n", @@ -425,7 +425,7 @@ "[3 rows x 2 columns in total]" ], "text/plain": [ - " city polulation_million\n", + " city population_in_millions\n", "0 Seattle 0.75\n", "1 New York 19.68\n", "2 Shanghai 26.32\n", @@ -439,8 +439,8 @@ } ], "source": [ - "result = gemini.predict(df, prompt=[\"what is the population in millions of\", df[\"city\"]], output_schema={\"polulation_million\": \"float64\"})\n", - "result[[\"city\", \"polulation_million\"]]" + "result = gemini.predict(df, prompt=[\"what is the population in millions of\", df[\"city\"]], output_schema={\"population_in_millions\": \"float64\"})\n", + "result[[\"city\", \"population_in_millions\"]]" ] }, { @@ -576,7 +576,7 @@ " \n", " city\n", " is_US_city\n", - " polulation_in_millions\n", + " population_in_millions\n", " rainy_days_per_year\n", " \n", " \n", @@ -608,7 +608,7 @@ "[3 rows x 4 columns in total]" ], "text/plain": [ - " city is_US_city polulation_in_millions rainy_days_per_year\n", + " city is_US_city population_in_millions rainy_days_per_year\n", "0 Seattle True 0.75 152\n", "1 New York True 8.8 121\n", "2 Shanghai False 26.32 115\n", @@ -622,8 +622,8 @@ } ], "source": [ - "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\"})\n", - "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\"]]" + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": 
\"bool\", \"population_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\"})\n", + "result[[\"city\", \"is_US_city\", \"population_in_millions\", \"rainy_days_per_year\"]]" ] }, { @@ -677,7 +677,7 @@ " \n", " city\n", " is_US_city\n", - " polulation_in_millions\n", + " population_in_millions\n", " rainy_days_per_year\n", " places_to_visit\n", " gps_coordinates\n", @@ -717,7 +717,7 @@ "[3 rows x 6 columns in total]" ], "text/plain": [ - " city is_US_city polulation_in_millions rainy_days_per_year \\\n", + " city is_US_city population_in_millions rainy_days_per_year \\\n", "0 Seattle True 0.74 150 \n", "1 New York True 8.4 121 \n", "2 Shanghai False 26.32 115 \n", @@ -741,8 +741,8 @@ } ], "source": [ - "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\", \"places_to_visit\": \"array\", \"gps_coordinates\": \"struct\"})\n", - "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\", \"places_to_visit\", \"gps_coordinates\"]]" + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"population_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\", \"places_to_visit\": \"array\", \"gps_coordinates\": \"struct\"})\n", + "result[[\"city\", \"is_US_city\", \"population_in_millions\", \"rainy_days_per_year\", \"places_to_visit\", \"gps_coordinates\"]]" ] } ], From bb45db8afdffa1417f11c050d40d4ec6d15b8654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 15:33:41 -0500 Subject: [PATCH 46/52] feat: add `bpd.options.bigquery.requests_transport_adapters` option (#1755) * feat: add `bpd.options.bigquery.requests_transport_adapters` option This allows for overriding requests-based settings such as the maximum connection pool size. * add unit test --- bigframes/_config/bigquery_options.py | 47 ++++++++++++++++++++- bigframes/pandas/io/api.py | 1 + bigframes/session/__init__.py | 1 + bigframes/session/clients.py | 20 ++++++++- tests/system/small/test_pandas_options.py | 10 +++-- tests/unit/_config/test_bigquery_options.py | 1 + tests/unit/session/test_clients.py | 28 +++++++++--- 7 files changed, 95 insertions(+), 13 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 3a6008eaa8..d591ea85b3 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -16,10 +16,11 @@ from __future__ import annotations -from typing import Literal, Optional +from typing import Literal, Optional, Sequence, Tuple import warnings import google.auth.credentials +import requests.adapters import bigframes.enums import bigframes.exceptions as bfe @@ -90,6 +91,9 @@ def __init__( allow_large_results: bool = False, ordering_mode: Literal["strict", "partial"] = "strict", client_endpoints_override: Optional[dict] = None, + requests_transport_adapters: Sequence[ + Tuple[str, requests.adapters.BaseAdapter] + ] = (), ): self._credentials = credentials self._project = project @@ -100,6 +104,7 @@ def __init__( self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check self._allow_large_results = allow_large_results + self._requests_transport_adapters = requests_transport_adapters self._session_started = False # Determines the ordering strictness for the session. 
self._ordering_mode = _validate_ordering_mode(ordering_mode) @@ -379,3 +384,43 @@ def client_endpoints_override(self, value: dict): ) self._client_endpoints_override = value + + @property + def requests_transport_adapters( + self, + ) -> Sequence[Tuple[str, requests.adapters.BaseAdapter]]: + """Transport adapters for requests-based REST clients such as the + google-cloud-bigquery package. + + For more details, see the explanation in `requests guide to transport + adapters + `_. + + **Examples:** + + Increase the connection pool size using the requests `HTTPAdapter + `_. + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.requests_transport_adapters = ( + ... ("http://", requests.adapters.HTTPAdapter(pool_maxsize=100)), + ... ("https://", requests.adapters.HTTPAdapter(pool_maxsize=100)), + ... ) # doctest: +SKIP + + Returns: + Sequence[Tuple[str, requests.adapters.BaseAdapter]]: + Prefixes and corresponding transport adapters to `mount + `_ + in requests-based REST clients. + """ + return self._requests_transport_adapters + + @requests_transport_adapters.setter + def requests_transport_adapters( + self, value: Sequence[Tuple[str, requests.adapters.BaseAdapter]] + ) -> None: + if self._session_started and self._requests_transport_adapters != value: + raise ValueError( + SESSION_STARTED_MESSAGE.format(attribute="requests_transport_adapters") + ) + self._requests_transport_adapters = value diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index c09251de3b..b2ce5f211e 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -496,6 +496,7 @@ def _set_default_session_location_if_possible(query): application_name=config.options.bigquery.application_name, bq_kms_key_name=config.options.bigquery.kms_key_name, client_endpoints_override=config.options.bigquery.client_endpoints_override, + requests_transport_adapters=config.options.bigquery.requests_transport_adapters, ) bqclient = clients_provider.bqclient diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 46d71a079e..c24dca554a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -172,6 +172,7 @@ def __init__( application_name=context.application_name, bq_kms_key_name=self._bq_kms_key_name, client_endpoints_override=context.client_endpoints_override, + requests_transport_adapters=context.requests_transport_adapters, ) # TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494 diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 86312eb9ba..d680b94b8a 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -17,18 +17,20 @@ import os import threading import typing -from typing import Optional +from typing import Optional, Sequence, Tuple import google.api_core.client_info import google.api_core.client_options import google.api_core.gapic_v1.client_info import google.auth.credentials +import google.auth.transport.requests import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 import pydata_google_auth +import requests import bigframes.constants import bigframes.version @@ -79,6 +81,10 @@ def __init__( application_name: Optional[str] = None, bq_kms_key_name: Optional[str] = None, client_endpoints_override: dict = {}, + *, + requests_transport_adapters: Sequence[ + Tuple[str, requests.adapters.BaseAdapter] + ] = (), ): 
credentials_project = None if credentials is None: @@ -124,6 +130,7 @@ def __init__( ) self._location = location self._use_regional_endpoints = use_regional_endpoints + self._requests_transport_adapters = requests_transport_adapters self._credentials = credentials self._bq_kms_key_name = bq_kms_key_name @@ -173,12 +180,21 @@ def _create_bigquery_client(self): user_agent=self._application_name ) + requests_session = google.auth.transport.requests.AuthorizedSession( + self._credentials + ) + for prefix, adapter in self._requests_transport_adapters: + requests_session.mount(prefix, adapter) + bq_client = bigquery.Client( client_info=bq_info, client_options=bq_options, - credentials=self._credentials, project=self._project, location=self._location, + # Instead of credentials, use _http so that users can override + # requests options with transport adapters. See internal issue + # b/419106112. + _http=requests_session, ) # If a new enough client library is available, we opt-in to the faster diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index d59b6d66b5..55e5036a42 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -279,16 +279,18 @@ def test_credentials_need_reauthentication( # Call get_global_session() *after* read_gbq so that our location detection # has a chance to work. session = bpd.get_global_session() - assert session.bqclient._credentials.valid + assert session.bqclient._http.credentials.valid with monkeypatch.context() as m: # Simulate expired credentials to trigger the credential refresh flow - m.setattr(session.bqclient._credentials, "expiry", datetime.datetime.utcnow()) - assert not session.bqclient._credentials.valid + m.setattr( + session.bqclient._http.credentials, "expiry", datetime.datetime.utcnow() + ) + assert not session.bqclient._http.credentials.valid # Simulate an exception during the credential refresh flow m.setattr( - session.bqclient._credentials, + session.bqclient._http.credentials, "refresh", mock.Mock(side_effect=google.auth.exceptions.RefreshError()), ) diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index b8f3a612d4..686499aa75 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -38,6 +38,7 @@ ("skip_bq_connection_check", False, True), ("client_endpoints_override", {}, {"bqclient": "endpoint_address"}), ("ordering_mode", "strict", "partial"), + ("requests_transport_adapters", object(), object()), ], ) def test_setter_raises_if_session_started(attribute, original_value, new_value): diff --git a/tests/unit/session/test_clients.py b/tests/unit/session/test_clients.py index 6b0d8583a5..5304c99466 100644 --- a/tests/unit/session/test_clients.py +++ b/tests/unit/session/test_clients.py @@ -15,25 +15,22 @@ import os import pathlib import tempfile -from typing import Optional +from typing import cast, Optional import unittest.mock as mock -import google.api_core.client_info -import google.api_core.client_options -import google.api_core.exceptions -import google.api_core.gapic_v1.client_info import google.auth.credentials import google.cloud.bigquery import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 +import requests.adapters import bigframes.session.clients as clients import bigframes.version -def create_clients_provider(application_name: 
Optional[str] = None): +def create_clients_provider(application_name: Optional[str] = None, **kwargs): credentials = mock.create_autospec(google.auth.credentials.Credentials) return clients.ClientsProvider( project="test-project", @@ -42,6 +39,7 @@ def create_clients_provider(application_name: Optional[str] = None): credentials=credentials, application_name=application_name, bq_kms_key_name="projects/my-project/locations/us/keyRings/myKeyRing/cryptoKeys/myKey", + **kwargs, ) @@ -136,6 +134,24 @@ def assert_clients_wo_user_agent( ) +def test_requests_transport_adapters_pool_maxsize(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + requests_transport_adapters = ( + ("http://", requests.adapters.HTTPAdapter(pool_maxsize=123)), + ("https://", requests.adapters.HTTPAdapter(pool_maxsize=123)), + ) # doctest: +SKIP + provider = create_clients_provider( + requests_transport_adapters=requests_transport_adapters + ) + + _, kwargs = cast(mock.Mock, provider.bqclient).call_args + requests_session = kwargs.get("_http") + adapter: requests.adapters.HTTPAdapter = requests_session.get_adapter( + "https://bigquery.googleapis.com/" + ) + assert adapter._pool_maxsize == 123 # type: ignore + + def test_user_agent_default(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider(application_name=None) From 9247535e53e7270806ca8e05c0870b09531a1fec Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 30 May 2025 13:46:56 -0700 Subject: [PATCH 47/52] chore: Finish factoring unit tests from presubmit (#1492) * chore: Finish factoring unit tests from presubmit * adjust cover requirement to 83 * add back test directory coverage * ignore some non-prod directories for coverage * amend omit arg in cover to work properly * reduce cov threshold to 84 --- noxfile.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/noxfile.py b/noxfile.py index 2fd437f469..297e8f9d6f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -119,10 +119,10 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. nox.options.sessions = [ - "unit", "system-3.9", "system-3.12", "cover", + # TODO(b/401609005): remove "cleanup", ] @@ -471,20 +471,31 @@ def cover(session): session.install("coverage", "pytest-cov") # Create a coverage report that includes only the product code. + omitted_paths = [ + # non-prod, unit tested + "bigframes/core/compile/polars/*", + "bigframes/core/compile/sqlglot/*", + # untested + "bigframes/streaming/*", + # utils + "bigframes/testing/*", + ] + session.run( "coverage", "report", "--include=bigframes/*", + # Only unit tested + f"--omit={','.join(omitted_paths)}", "--show-missing", - "--fail-under=85", + "--fail-under=84", ) - # Make sure there is no dead code in our test directories. + # Make sure there is no dead code in our system test directories. session.run( "coverage", "report", "--show-missing", - "--include=tests/unit/*", "--include=tests/system/small/*", # TODO(b/353775058) resume coverage to 100 when the issue is fixed. 
"--fail-under=99", From acba0321cafeb49f3e560a364ebbf3d15fb8af88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 16:14:56 -0500 Subject: [PATCH 48/52] fix: include location in Session-based temporary storage manager DDL queries (#1780) * fix: include location in Session-based temporary storage manager DDL queries * fix indentation error * set location in read session, too * add missing location parent * remove broken read session parent --- bigframes/session/bigquery_session.py | 9 +++-- tests/system/large/test_location.py | 52 +++++++++++++-------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index ae8dc88d43..883087df07 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -84,7 +84,9 @@ def create_temp_table( ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" - job = self.bqclient.query(ddl, job_config=job_config) + job = self.bqclient.query( + ddl, job_config=job_config, location=self.location + ) job.result() # return the fully qualified table, so it can be used outside of the session return job.destination @@ -94,7 +96,10 @@ def close(self): self._sessiondaemon.stop() if self._session_id is not None and self.bqclient is not None: - self.bqclient.query_and_wait(f"CALL BQ.ABORT_SESSION('{self._session_id}')") + self.bqclient.query_and_wait( + f"CALL BQ.ABORT_SESSION('{self._session_id}')", + location=self.location, + ) def _get_session_id(self) -> str: if self._session_id: diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index d4428c1f95..3ebe2bb040 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -14,8 +14,6 @@ import typing -from google.cloud import bigquery -from google.cloud.bigquery_storage import types as bqstorage_types import pandas import pandas.testing import pytest @@ -41,7 +39,15 @@ def _assert_bq_execution_location( if expected_location is None: expected_location = session._location - assert typing.cast(bigquery.QueryJob, df.query_job).location == expected_location + query_job = df.query_job + assert query_job is not None + assert query_job.location == expected_location + destination = query_job.destination + assert destination is not None + destination_dataset = session.bqclient.get_dataset( + f"{destination.project}.{destination.dataset_id}" + ) + assert destination_dataset.location == expected_location # Ensure operation involving BQ client suceeds result = ( @@ -52,38 +58,28 @@ def _assert_bq_execution_location( .head() ) - assert ( - typing.cast(bigquery.QueryJob, result.query_job).location == expected_location + # Use allow_large_results = True to force a job to be created. 
+ result_pd = result.to_pandas(allow_large_results=True) + + query_job = df.query_job + assert query_job is not None + assert query_job.location == expected_location + destination = query_job.destination + assert destination is not None + destination_dataset = session.bqclient.get_dataset( + f"{destination.project}.{destination.dataset_id}" ) + assert destination_dataset.location == expected_location expected_result = pandas.DataFrame( {"number": [444, 222]}, index=pandas.Index(["aaa", "bbb"], name="name") ) pandas.testing.assert_frame_equal( - expected_result, result.to_pandas(), check_dtype=False, check_index_type=False - ) - - # Ensure BQ Storage Read client operation succceeds - table = result.query_job.destination - requested_session = bqstorage_types.ReadSession( # type: ignore[attr-defined] - table=f"projects/{table.project}/datasets/{table.dataset_id}/tables/{table.table_id}", - data_format=bqstorage_types.DataFormat.ARROW, # type: ignore[attr-defined] - ) - read_session = session.bqstoragereadclient.create_read_session( - parent=f"projects/{table.project}", - read_session=requested_session, - max_stream_count=1, + expected_result, + result_pd, + check_dtype=False, + check_index_type=False, ) - reader = session.bqstoragereadclient.read_rows(read_session.streams[0].name) - frames = [] - for message in reader.rows().pages: - frames.append(message.to_dataframe()) - read_dataframe = pandas.concat(frames) - # normalize before comparing since we lost some of the bigframes column - # naming abtractions in the direct read of the destination table - read_dataframe = read_dataframe.set_index("name") - read_dataframe.columns = result.columns - pandas.testing.assert_frame_equal(expected_result, read_dataframe) def test_bq_location_default(): From 734cc652e435dc5d97a23411735aa51b7824e381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 16:16:52 -0500 Subject: [PATCH 49/52] feat: support `inplace=True` in `rename` and `rename_axis` (#1744) * feat: support inplace=True in rename and rename_axis * fix typing issues * do not consider tuples single labels in rename --- bigframes/core/compile/polars/compiler.py | 5 + bigframes/core/indexes/base.py | 64 ++++++++++-- bigframes/dataframe.py | 63 +++++++++++- bigframes/series.py | 99 ++++++++++++++++--- bigframes/testing/mocks.py | 17 ++-- bigframes/testing/polars_session.py | 3 + tests/unit/test_dataframe.py | 39 ++++++++ tests/unit/test_index.py | 40 ++++++++ tests/unit/test_local_engine.py | 27 +++++ tests/unit/test_series.py | 37 +++++++ .../bigframes_vendored/pandas/core/frame.py | 23 +++-- .../pandas/core/indexes/base.py | 9 +- .../bigframes_vendored/pandas/core/series.py | 10 +- 13 files changed, 391 insertions(+), 45 deletions(-) create mode 100644 tests/unit/test_index.py diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index b2f018e80a..14d8e8501c 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -92,6 +92,8 @@ def _( return args[0] < args[1] if isinstance(op, ops.eq_op.__class__): return args[0] == args[1] + if isinstance(op, ops.ne_op.__class__): + return args[0] != args[1] if isinstance(op, ops.mod_op.__class__): return args[0] % args[1] if isinstance(op, ops.coalesce_op.__class__): @@ -101,6 +103,9 @@ def _( for pred, result in zip(args[2::2], args[3::2]): return expr.when(pred).then(result) return expr + if isinstance(op, ops.where_op.__class__): + original, condition, otherwise = args + 
return pl.when(condition).then(original).otherwise(otherwise) raise NotImplementedError(f"Polars compiler hasn't implemented {op}") @dataclasses.dataclass(frozen=True) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 6da68e2e8f..44b1d9d4fa 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -145,12 +145,7 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._block.with_index_labels(values) - if self._linked_frame is not None: - self._linked_frame._set_block( - self._linked_frame._block.with_index_labels(values) - ) - self._block = new_block + self.rename(values, inplace=True) @property def nlevels(self) -> int: @@ -411,11 +406,62 @@ def fillna(self, value=None) -> Index: ops.fillna_op.as_expr(ex.free_var("arg"), ex.const(value)) ) - def rename(self, name: Union[str, Sequence[str]]) -> Index: - names = [name] if isinstance(name, str) else list(name) + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + ) -> Index: + ... + + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: Literal[False], + ) -> Index: + ... + + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: Literal[True], + ) -> None: + ... + + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: bool = False, + ) -> Optional[Index]: + # Tuples are allowed as a label, but we specifically exclude them here. + # This is because tuples are hashable, but we want to treat them as a + # sequence. If name is iterable, we want to assume we're working with a + # MultiIndex. Unfortunately, strings are iterable and we don't want a + # list of all the characters, so specifically exclude the non-tuple + # hashables. + if isinstance(name, blocks.Label) and not isinstance(name, tuple): + names = [name] + else: + names = list(name) + if len(names) != self.nlevels: raise ValueError("'name' must be same length as levels") - return Index(self._block.with_index_labels(names)) + + new_block = self._block.with_index_labels(names) + + if inplace: + if self._linked_frame is not None: + self._linked_frame._set_block( + self._linked_frame._block.with_index_labels(names) + ) + self._block = new_block + return None + else: + return Index(new_block) def drop( self, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index be940a1e82..1d0d485392 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2082,15 +2082,67 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: return self._block.index.resolve_level(level) + @overload def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame: + ... + + @overload + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: Literal[False] + ) -> DataFrame: + ... + + @overload + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: Literal[True] + ) -> None: + ... 
+ + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: bool = False + ) -> Optional[DataFrame]: block = self._block.rename(columns=columns) - return DataFrame(block) + if inplace: + self._block = block + return None + else: + return DataFrame(block) + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + ) -> DataFrame: + ... + + @overload def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[False], **kwargs, ) -> DataFrame: + ... + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: bool = False, + **kwargs, + ) -> Optional[DataFrame]: if len(kwargs) != 0: raise NotImplementedError( f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" @@ -2100,7 +2152,14 @@ def rename_axis( labels = mapper else: labels = [mapper] - return DataFrame(self._block.with_index_labels(labels)) + + block = self._block.with_index_labels(labels) + + if inplace: + self._block = block + return None + else: + return DataFrame(block) @validations.requires_ordering() def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool: diff --git a/bigframes/series.py b/bigframes/series.py index 866f4d0a5d..74e8d03c8d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -31,6 +31,7 @@ Literal, Mapping, Optional, + overload, Sequence, Tuple, Union, @@ -95,6 +96,10 @@ class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Ser # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 13000 + # Ensure mypy can more robustly determine the type of self._block since it + # gets set in various places. + _block: blocks.Block + def __init__(self, *args, **kwargs): self._query_job: Optional[bigquery.QueryJob] = None super().__init__(*args, **kwargs) @@ -254,22 +259,45 @@ def __iter__(self) -> typing.Iterator: def copy(self) -> Series: return Series(self._block) + @overload def rename( - self, index: Union[blocks.Label, Mapping[Any, Any]] = None, **kwargs + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + ) -> Series: + ... + + @overload + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: Literal[False], + **kwargs, ) -> Series: + ... + + @overload + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: bool = False, + **kwargs, + ) -> Optional[Series]: if len(kwargs) != 0: raise NotImplementedError( f"rename does not currently support any keyword arguments. 
{constants.FEEDBACK_LINK}" ) - # rename the Series name - if index is None or isinstance( - index, str - ): # Python 3.9 doesn't allow isinstance of Optional - index = typing.cast(Optional[str], index) - block = self._block.with_column_labels([index]) - return Series(block) - # rename the index if isinstance(index, Mapping): index = typing.cast(Mapping[Any, Any], index) @@ -294,22 +322,61 @@ def rename( block = block.set_index(new_idx_ids, index_labels=block.index.names) - return Series(block) + if inplace: + self._block = block + return None + else: + return Series(block) # rename the Series name if isinstance(index, typing.Hashable): + # Python 3.9 doesn't allow isinstance of Optional index = typing.cast(Optional[str], index) block = self._block.with_column_labels([index]) - return Series(block) + + if inplace: + self._block = block + return None + else: + return Series(block) raise ValueError(f"Unsupported type of parameter index: {type(index)}") - @validations.requires_index + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + ) -> Series: + ... + + @overload def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[False], **kwargs, ) -> Series: + ... + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @validations.requires_index + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: bool = False, + **kwargs, + ) -> Optional[Series]: if len(kwargs) != 0: raise NotImplementedError( f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" @@ -319,7 +386,13 @@ def rename_axis( labels = mapper else: labels = [mapper] - return Series(self._block.with_index_labels(labels)) + + block = self._block.with_index_labels(labels) + if inplace: + self._block = block + return None + else: + return Series(block) def equals( self, other: typing.Union[Series, bigframes.dataframe.DataFrame] diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 528835f6da..ca6fa57d0b 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -14,7 +14,7 @@ import copy import datetime -from typing import Optional, Sequence +from typing import Any, Dict, Optional, Sequence import unittest.mock as mock import google.auth.credentials @@ -23,12 +23,9 @@ import bigframes import bigframes.clients -import bigframes.core.ordering +import bigframes.core.global_session import bigframes.dataframe -import bigframes.series import bigframes.session.clients -import bigframes.session.executor -import bigframes.session.metrics """Utilities for creating test resources.""" @@ -129,7 +126,10 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): def create_dataframe( - monkeypatch: pytest.MonkeyPatch, *, session: Optional[bigframes.Session] = None + monkeypatch: pytest.MonkeyPatch, + *, + session: Optional[bigframes.Session] = None, + data: Optional[Dict[str, Sequence[Any]]] = None, ) -> bigframes.dataframe.DataFrame: """[Experimental] Create a mock DataFrame that avoids making Google Cloud API calls. @@ -138,8 +138,11 @@ def create_dataframe( if session is None: session = create_bigquery_session() + if data is None: + data = {"col": []} + # Since this may create a ReadLocalNode, the session we explicitly pass in # might not actually be used. 
Mock out the global session, too. monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({"col": []}, session=session) + return bigframes.dataframe.DataFrame(data, session=session) diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index d592b49038..f8dda8da55 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -16,6 +16,7 @@ from typing import Optional, Union import weakref +import pandas import polars import bigframes @@ -87,5 +88,7 @@ def __init__(self): def read_pandas(self, pandas_dataframe, write_engine="default"): # override read_pandas to always keep data local-only + if isinstance(pandas_dataframe, pandas.Series): + pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) return bigframes.dataframe.DataFrame(local_block) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 9d67fd33b7..d630380e7a 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -90,6 +90,45 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( assert destination.startswith(anonymous_dataset_id) +def test_dataframe_rename_columns(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [], "col2": [], "col3": []} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + renamed = dataframe.rename(columns={"col1": "a", "col2": "b", "col3": "c"}) + assert renamed.columns.to_list() == ["a", "b", "c"] + + +def test_dataframe_rename_columns_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [], "col2": [], "col3": []} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + assert ( + dataframe.rename(columns={"col1": "a", "col2": "b", "col3": "c"}, inplace=True) + is None + ) + assert dataframe.columns.to_list() == ["a", "b", "c"] + + +def test_dataframe_rename_axis(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"]) + assert list(dataframe.index.names) == ["index1", "index2"] + renamed = dataframe.rename_axis(["a", "b"]) + assert list(renamed.index.names) == ["a", "b"] + + +def test_dataframe_rename_axis_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"]) + assert list(dataframe.index.names) == ["index1", "index2"] + assert dataframe.rename_axis(["a", "b"], inplace=True) is None + assert list(dataframe.index.names) == ["a", "b"] + + def test_dataframe_semantics_property_future_warning( monkeypatch: pytest.MonkeyPatch, ): diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py new file mode 100644 index 0000000000..97f1e4419e --- /dev/null +++ b/tests/unit/test_index.py @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.testing import mocks + + +def test_index_rename(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"idx": [], "col": []} + ).set_index("idx") + index = dataframe.index + assert index.name == "idx" + renamed = index.rename("my_index_name") + assert renamed.name == "my_index_name" + + +def test_index_rename_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"idx": [], "col": []} + ).set_index("idx") + index = dataframe.index + assert index.name == "idx" + assert index.rename("my_index_name", inplace=True) is None + + # Make sure the linked DataFrame is updated, too. + assert dataframe.index.name == "my_index_name" + assert index.name == "my_index_name" diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index e36dc3df3c..509bc6ade2 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -79,6 +79,33 @@ def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_ses pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_polars_local_engine_series_rename_with_mapping(polars_session): + pd_series = pd.Series( + ["a", "b", "c"], index=[1, 2, 3], dtype="string[pyarrow]", name="test_name" + ) + bf_series = bpd.Series(pd_series, session=polars_session) + + bf_result = bf_series.rename({1: 100, 2: 200, 3: 300}).to_pandas() + pd_result = pd_series.rename({1: 100, 2: 200, 3: 300}) + # pd default index is int64, bf is Int64 + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session): + pd_series = pd.Series( + ["a", "b", "c"], index=[1, 2, 3], dtype="string[pyarrow]", name="test_name" + ) + bf_series = bpd.Series(pd_series, session=polars_session) + + pd_series.rename({1: 100, 2: 200, 3: 300}, inplace=True) + assert bf_series.rename({1: 100, 2: 200, 3: 300}, inplace=True) is None + + bf_result = bf_series.to_pandas() + pd_result = pd_series + # pd default index is int64, bf is Int64 + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + def test_polars_local_engine_reset_index( small_inline_frame: pd.DataFrame, polars_session ): diff --git a/tests/unit/test_series.py b/tests/unit/test_series.py index 1409209c6c..8a083d7e4a 100644 --- a/tests/unit/test_series.py +++ b/tests/unit/test_series.py @@ -12,7 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import cast + +import pytest + import bigframes.series +from bigframes.testing import mocks + + +def test_series_rename(monkeypatch: pytest.MonkeyPatch): + series = cast(bigframes.series.Series, mocks.create_dataframe(monkeypatch)["col"]) + assert series.name == "col" + renamed = series.rename("renamed_col") + assert renamed.name == "renamed_col" + + +def test_series_rename_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + series = cast(bigframes.series.Series, mocks.create_dataframe(monkeypatch)["col"]) + assert series.name == "col" + assert series.rename("renamed_col", inplace=True) is None + assert series.name == "renamed_col" + + +def test_series_rename_axis(monkeypatch: pytest.MonkeyPatch): + series = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"])["col1"] + assert list(series.index.names) == ["index1", "index2"] + renamed = series.rename_axis(["a", "b"]) + assert list(renamed.index.names) == ["a", "b"] + + +def test_series_rename_axis_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + series = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"])["col1"] + assert list(series.index.names) == ["index1", "index2"] + assert series.rename_axis(["a", "b"], inplace=True) is None + assert list(series.index.names) == ["a", "b"] def test_series_repr_with_uninitialized_object(): diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 63142e4dd8..c1b5b5a86b 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,7 +11,7 @@ """ from __future__ import annotations -from typing import Hashable, Iterable, Literal, Mapping, Optional, Sequence, Union +from typing import Hashable, Iterable, Literal, Optional, Sequence, Union from bigframes_vendored import constants import bigframes_vendored.pandas.core.generic as generic @@ -1392,8 +1392,9 @@ def align( def rename( self, *, - columns: Mapping, - ) -> DataFrame: + columns, + inplace, + ): """Rename columns. Dict values must be unique (1-to-1). Labels not contained in a dict @@ -1426,16 +1427,20 @@ def rename( Args: columns (Mapping): Dict-like from old column labels to new column labels. + inplace (bool): + Default False. Whether to modify the DataFrame rather than + creating a new one. Returns: - bigframes.pandas.DataFrame: DataFrame with the renamed axis labels. + bigframes.pandas.DataFrame | None: + DataFrame with the renamed axis labels or None if ``inplace=True``. Raises: KeyError: If any of the labels is not found. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: + def rename_axis(self, mapper, *, inplace, **kwargs): """ Set the name of the axis for the index. @@ -1443,11 +1448,15 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Currently only accepts a single string parameter (the new name of the index). Args: - mapper str: + mapper (str): Value to set the axis name attribute. + inplace (bool): + Default False. Modifies the object directly, instead of + creating a new Series or DataFrame. Returns: - bigframes.pandas.DataFrame: DataFrame with the new index name + bigframes.pandas.DataFrame | None: + DataFrame with the new index name or None if ``inplace=True``. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index be1c5034f9..7df1c7a9de 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -941,7 +941,7 @@ def fillna(self, value) -> Index: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename(self, name) -> Index: + def rename(self, name, *, inplace): """ Alter Index or MultiIndex name. @@ -960,10 +960,13 @@ def rename(self, name) -> Index: Args: name (label or list of labels): Name(s) to set. + inplace (bool): + Default False. Modifies the object directly, instead of + creating a new Index or MultiIndex. Returns: - bigframes.pandas.Index: - The same type as the caller. + bigframes.pandas.Index | None: + The same type as the caller or None if ``inplace=True``. Raises: ValueError: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 673a6f362f..61cd6a47bf 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5257,7 +5257,7 @@ def argmin(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename(self, index, **kwargs) -> Series | None: + def rename(self, index, *, inplace, **kwargs): """ Alter Series index labels or name. @@ -5301,15 +5301,17 @@ def rename(self, index, **kwargs) -> Series | None: the index. Scalar or hashable sequence-like will alter the ``Series.name`` attribute. + inplace (bool): + Default False. Whether to return a new Series. Returns: - bigframes.pandas.Series: - Series with index labels. + bigframes.pandas.Series | None: + Series with index labels or None if ``inplace=True``. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename_axis(self, mapper, **kwargs): + def rename_axis(self, mapper, *, inplace, **kwargs): """ Set the name of the axis for the index or columns. From 6e63eca29f20d83435878273604816ce7595c396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 16:49:13 -0500 Subject: [PATCH 50/52] docs: integrations notebook extracts token from `bqclient._http.credentials` instead of `bqclient._credentials` (#1784) --- notebooks/dataframes/integrations.ipynb | 110 +++++++++++------------- 1 file changed, 49 insertions(+), 61 deletions(-) diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb index 9edb174f18..8c7790b1ea 100644 --- a/notebooks/dataframes/integrations.ipynb +++ b/notebooks/dataframes/integrations.ipynb @@ -66,9 +66,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/swast/src/bigframes-2/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return func(get_global_session(), *args, **kwargs)\n" + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" ] + }, + { + "data": { + "text/html": [ + "Query job 1772ca28-2ef5-425c-87fe-8227aeb9318c is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -96,13 +108,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job eb7f3bbe-dda9-4d2f-b195-21de862d7055 is DONE. 0 Bytes processed. Open Job" + "Query job 33bd5814-b594-4ec4-baba-8f6b6e285e48 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -128,13 +140,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 4ad50c3c-91d0-4fef-91f6-0a2c5a30c38f is DONE. 0 Bytes processed. Open Job" + "Query job 1594d97a-1203-4c28-8730-caffb3ac4e9e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -146,10 +158,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_9a045ff143db4f8ab2018994287020f3'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_7578d5bd9949422599ccb9e4fe6451be'" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -172,13 +184,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 9e7d4b1a-d7fc-4599-bab4-40062c83288e is DONE. 0 Bytes processed. Open Job" + "Query job 8afc1538-9779-487a-a063-def5f438ee11 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -192,11 +204,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 3 4 -0.1250 d\n", - "1 1 2 -0.5000 b\n", + "0 1 2 -0.5000 b\n", + "1 2 3 0.2500 c\n", "2 0 1 1.0000 a\n", - "3 4 5 0.0625 e\n", - "4 2 3 0.2500 c\n" + "3 3 4 -0.1250 d\n", + "4 4 5 0.0625 e\n" ] } ], @@ -238,13 +250,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 62db313e-7632-4dbb-8eff-5035d0e6c27e is DONE. 0 Bytes processed. Open Job" + "Query job b6f68a49-5129-448d-bca3-62a23dced10d is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -258,11 +270,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 1 2 -0.5000 b\n", - "1 3 4 -0.1250 d\n", - "2 0 1 1.0000 a\n", - "3 4 5 0.0625 e\n", - "4 2 3 0.2500 c\n" + "0 3 4 -0.1250 d\n", + "1 1 2 -0.5000 b\n", + "2 4 5 0.0625 e\n", + "3 2 3 0.2500 c\n", + "4 0 1 1.0000 a\n" ] } ], @@ -274,7 +286,7 @@ " table_id = df.to_gbq()\n", "\n", " bqclient = df.bqclient\n", - " token = bqclient._credentials.token\n", + " token = bqclient._http.credentials.token\n", " project_id = bqclient.project\n", "\n", " share_table_and_start_workload(table_id, token, project_id)\n", @@ -335,13 +347,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 1cbd8898-97c7-419e-87af-b72a9432afb6 is DONE. 0 Bytes processed. Open Job" + "Query job 0f205180-cf26-46e5-950d-109947b7f5a1 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -353,10 +365,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_58b9b6fc0c3349bf8d3dd6fb29ab5322'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_240520e0723548f18fd3bd5d24cbbf82'" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -378,13 +390,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 40e54aa9-fad7-47c3-9bec-144f6c7106d8 is DONE. 0 Bytes processed. Open Job" + "Query job 80177f9a-4f6e-4a4e-97db-f119ea686c62 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -396,10 +408,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_cdb4f54063b0417a8309c462b70239fa'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_4ca41d2f28f84feca1bbafe9304fd89f'" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -427,16 +439,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" + "Dataset(DatasetReference('bigframes-dev', 'my_dataset'))" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -451,33 +463,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 73cf9e04-d5fa-4765-827c-665f0e6b9e00 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b177eb37-197f-4732-8978-c74cccb36e01 is DONE. 270 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -593,7 +581,7 @@ "[10 rows x 3 columns]" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -683,7 +671,7 @@ ], "metadata": { "kernelspec": { - "display_name": "bigframes", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -697,7 +685,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.10" } }, "nbformat": 4, From 611e43b156483848a5470f889fb7b2b473ecff4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 30 May 2025 16:50:11 -0500 Subject: [PATCH 51/52] docs: add MatrixFactorization to the table of contents (#1725) --- docs/templates/toc.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index f70e81d196..a27f162a9a 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -115,6 +115,8 @@ uid: bigframes.ml.decomposition - name: PCA uid: bigframes.ml.decomposition.PCA + - name: MatrixFactorization + uid: bigframes.ml.decomposition.MatrixFactorization name: decomposition - items: - name: Overview From e3952e65f34f3b27ed43fdeb1f980f2b6de06f83 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 09:14:25 -0500 Subject: [PATCH 52/52] chore(main): release 2.5.0 (#1724) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 60 +++++++++++++++++++++++ bigframes/version.py | 4 +- third_party/bigframes_vendored/version.py | 4 +- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fafd11c8a..84dd3f36c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,66 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.4.0...v2.5.0) (2025-05-30) + + +### ⚠ BREAKING CHANGES + +* the updated `ai.map()` parameter list is not backward-compatible + +### Features + +* Add `bpd.options.bigquery.requests_transport_adapters` option ([#1755](https://github.com/googleapis/python-bigquery-dataframes/issues/1755)) ([bb45db8](https://github.com/googleapis/python-bigquery-dataframes/commit/bb45db8afdffa1417f11c050d40d4ec6d15b8654)) +* Add bbq.json_query and warn bbq.json_extract deprecated ([#1756](https://github.com/googleapis/python-bigquery-dataframes/issues/1756)) ([ec81dd2](https://github.com/googleapis/python-bigquery-dataframes/commit/ec81dd2228697d5bf193d86396cf7f3212e0289d)) +* Add bpd.options.reset() method ([#1743](https://github.com/googleapis/python-bigquery-dataframes/issues/1743)) ([36c359d](https://github.com/googleapis/python-bigquery-dataframes/commit/36c359d2521089e186a412d353daf9de6cfbc8f4)) +* Add DataFrame.round method ([#1742](https://github.com/googleapis/python-bigquery-dataframes/issues/1742)) ([3ea6043](https://github.com/googleapis/python-bigquery-dataframes/commit/3ea6043be7025fa7a11cca27b02f5505bbc9b129)) +* Add deferred data uploading ([#1720](https://github.com/googleapis/python-bigquery-dataframes/issues/1720)) ([1f6442e](https://github.com/googleapis/python-bigquery-dataframes/commit/1f6442e576c35ec784ccf9cab3d081d46e45a5ce)) +* Add deprecation warning to Gemini-1.5-X, text-embedding-004, and remove remove legacy models in notebooks and docs 
([#1723](https://github.com/googleapis/python-bigquery-dataframes/issues/1723)) ([80aad9a](https://github.com/googleapis/python-bigquery-dataframes/commit/80aad9af794c2e06d1608c879f459a836fd4448b)) +* Add structured output for ai map, ai filter and ai join ([#1746](https://github.com/googleapis/python-bigquery-dataframes/issues/1746)) ([133ac6b](https://github.com/googleapis/python-bigquery-dataframes/commit/133ac6b0e1f1e7a12844a4b6fd5b26df59f7ef37)) +* Add support for df.loc[list, column(s)] ([#1761](https://github.com/googleapis/python-bigquery-dataframes/issues/1761)) ([768a757](https://github.com/googleapis/python-bigquery-dataframes/commit/768a7570845c4eb88f495d7f3c0f3158accdc231)) +* Include bq schema and query string in dry run results ([#1752](https://github.com/googleapis/python-bigquery-dataframes/issues/1752)) ([bb51147](https://github.com/googleapis/python-bigquery-dataframes/commit/bb511475b74cc253230725846098a9045be2e324)) +* Support `inplace=True` in `rename` and `rename_axis` ([#1744](https://github.com/googleapis/python-bigquery-dataframes/issues/1744)) ([734cc65](https://github.com/googleapis/python-bigquery-dataframes/commit/734cc652e435dc5d97a23411735aa51b7824e381)) +* Support `unique()` for Index ([#1750](https://github.com/googleapis/python-bigquery-dataframes/issues/1750)) ([27fac78](https://github.com/googleapis/python-bigquery-dataframes/commit/27fac78cb5654e5655aec861062837a7d4f3f679)) +* Support astype conversions to and from JSON dtypes ([#1716](https://github.com/googleapis/python-bigquery-dataframes/issues/1716)) ([8ef4de1](https://github.com/googleapis/python-bigquery-dataframes/commit/8ef4de10151717f88364a909b29fa7600e959ada)) +* Support dict param for dataframe.agg() ([#1772](https://github.com/googleapis/python-bigquery-dataframes/issues/1772)) ([f9c29c8](https://github.com/googleapis/python-bigquery-dataframes/commit/f9c29c85053d8111a74ce382490daed36f8bb35b)) +* Support dtype parameter in read_csv for bigquery engine ([#1749](https://github.com/googleapis/python-bigquery-dataframes/issues/1749)) ([50dca4c](https://github.com/googleapis/python-bigquery-dataframes/commit/50dca4c706d78673b03f90eccf776118247ba30b)) +* Use read api for some peek ops ([#1731](https://github.com/googleapis/python-bigquery-dataframes/issues/1731)) ([108f4d2](https://github.com/googleapis/python-bigquery-dataframes/commit/108f4d259e1bcfbe6c7aa3c3c3f8f605cf7615ee)) + + +### Bug Fixes + +* Fix clip int series with float bounds ([#1739](https://github.com/googleapis/python-bigquery-dataframes/issues/1739)) ([d451aef](https://github.com/googleapis/python-bigquery-dataframes/commit/d451aefd2181aef250c3b48cceac09063081cab2)) +* Fix error with self-merge operations ([#1774](https://github.com/googleapis/python-bigquery-dataframes/issues/1774)) ([e5fe143](https://github.com/googleapis/python-bigquery-dataframes/commit/e5fe14339b4a40ab4a25657ee0453e4108cf8bba)) +* Fix the default value for na_value for numpy conversions ([#1766](https://github.com/googleapis/python-bigquery-dataframes/issues/1766)) ([0629cac](https://github.com/googleapis/python-bigquery-dataframes/commit/0629cac7f9a9370a72c1ae25e014eb478a4c8c08)) +* Include location in Session-based temporary storage manager DDL queries ([#1780](https://github.com/googleapis/python-bigquery-dataframes/issues/1780)) ([acba032](https://github.com/googleapis/python-bigquery-dataframes/commit/acba0321cafeb49f3e560a364ebbf3d15fb8af88)) +* Prevent creating unnecessary client objects in multithreaded environments 
([#1757](https://github.com/googleapis/python-bigquery-dataframes/issues/1757)) ([1cf9f5e](https://github.com/googleapis/python-bigquery-dataframes/commit/1cf9f5e8dba733ee26d15fc5edc44c81e094e9a0))
* Reduce bigquery table modification via DML for to_gbq ([#1737](https://github.com/googleapis/python-bigquery-dataframes/issues/1737)) ([545cdca](https://github.com/googleapis/python-bigquery-dataframes/commit/545cdcac1361607678c2574f0f31eb43950073e5))
* Stop ignoring arguments to `MatrixFactorization.score(X, y)` ([#1726](https://github.com/googleapis/python-bigquery-dataframes/issues/1726)) ([55c07e9](https://github.com/googleapis/python-bigquery-dataframes/commit/55c07e9d4315949c37ffa3e03c8fedc6daf17faf))
* Support JSON and STRUCT for bbq.sql_scalar ([#1754](https://github.com/googleapis/python-bigquery-dataframes/issues/1754)) ([190390b](https://github.com/googleapis/python-bigquery-dataframes/commit/190390b804c2131c2eaa624d7f025febb7784b01))
* Support str.replace re.compile with flags ([#1736](https://github.com/googleapis/python-bigquery-dataframes/issues/1736)) ([f8d2cd2](https://github.com/googleapis/python-bigquery-dataframes/commit/f8d2cd24281415f4a8f9193b676f5483128cd173))


### Performance Improvements

* Faster local data comparison using identity ([#1738](https://github.com/googleapis/python-bigquery-dataframes/issues/1738)) ([2858b1e](https://github.com/googleapis/python-bigquery-dataframes/commit/2858b1efb4fe74097dcb17c086ee1dc18e53053c))
* Optimize repr for unordered gbq table ([#1778](https://github.com/googleapis/python-bigquery-dataframes/issues/1778)) ([2bc4fbc](https://github.com/googleapis/python-bigquery-dataframes/commit/2bc4fbc78eba4bb2ee335e0475700a7ca5bc84d7))
* Use JOB_CREATION_OPTIONAL when `allow_large_results=False` ([#1763](https://github.com/googleapis/python-bigquery-dataframes/issues/1763)) ([15f3f2a](https://github.com/googleapis/python-bigquery-dataframes/commit/15f3f2aa42cfe4a2233f62c5f8906e7f7658f9fa))


### Dependencies

* Avoid `gcsfs==2025.5.0` ([#1762](https://github.com/googleapis/python-bigquery-dataframes/issues/1762)) ([68d5e2c](https://github.com/googleapis/python-bigquery-dataframes/commit/68d5e2cbef3510cadc7e9dd199117c1e3b02d19f))


### Documentation

* Add llm output_schema notebook ([#1732](https://github.com/googleapis/python-bigquery-dataframes/issues/1732)) ([b2261cc](https://github.com/googleapis/python-bigquery-dataframes/commit/b2261cc07cd58b51d212f9bf495c5022e587f816))
* Add MatrixFactorization to the table of contents ([#1725](https://github.com/googleapis/python-bigquery-dataframes/issues/1725)) ([611e43b](https://github.com/googleapis/python-bigquery-dataframes/commit/611e43b156483848a5470f889fb7b2b473ecff4d))
* Fix typo for "population" in the `GeminiTextGenerator.predict(..., output_schema={...})` sample notebook ([#1748](https://github.com/googleapis/python-bigquery-dataframes/issues/1748)) ([bd07e05](https://github.com/googleapis/python-bigquery-dataframes/commit/bd07e05d26820313c052eaf41c267a1ab20b4fc6))
* Integrations notebook extracts token from `bqclient._http.credentials` instead of `bqclient._credentials` ([#1784](https://github.com/googleapis/python-bigquery-dataframes/issues/1784)) ([6e63eca](https://github.com/googleapis/python-bigquery-dataframes/commit/6e63eca29f20d83435878273604816ce7595c396))
* Updated multimodal notebook instructions ([#1745](https://github.com/googleapis/python-bigquery-dataframes/issues/1745)) 
([1df8ca6](https://github.com/googleapis/python-bigquery-dataframes/commit/1df8ca6312ee428d55c2091a00c73b13d9a6b193)) +* Use partial ordering mode in the quickstart sample ([#1734](https://github.com/googleapis/python-bigquery-dataframes/issues/1734)) ([476b7dd](https://github.com/googleapis/python-bigquery-dataframes/commit/476b7dd7c2639cb6804272d06aa5c1db666819da)) + ## [2.4.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.3.0...v2.4.0) (2025-05-12) diff --git a/bigframes/version.py b/bigframes/version.py index 3a34d3d7bb..6cc3d952ed 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.4.0" +__version__ = "2.5.0" # {x-release-please-start-date} -__release_date__ = "2025-05-12" +__release_date__ = "2025-05-30" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 3a34d3d7bb..6cc3d952ed 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.4.0" +__version__ = "2.5.0" # {x-release-please-start-date} -__release_date__ = "2025-05-12" +__release_date__ = "2025-05-30" # {x-release-please-end}