diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0fafd11c8a..84dd3f36c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,66 @@
 [1]: https://pypi.org/project/bigframes/#history
 
+## [2.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.4.0...v2.5.0) (2025-05-30)
+
+
+### ⚠ BREAKING CHANGES
+
+* the updated `ai.map()` parameter list is not backward-compatible
+
+### Features
+
+* Add `bpd.options.bigquery.requests_transport_adapters` option ([#1755](https://github.com/googleapis/python-bigquery-dataframes/issues/1755)) ([bb45db8](https://github.com/googleapis/python-bigquery-dataframes/commit/bb45db8afdffa1417f11c050d40d4ec6d15b8654))
+* Add bbq.json_query and warn bbq.json_extract deprecated ([#1756](https://github.com/googleapis/python-bigquery-dataframes/issues/1756)) ([ec81dd2](https://github.com/googleapis/python-bigquery-dataframes/commit/ec81dd2228697d5bf193d86396cf7f3212e0289d))
+* Add bpd.options.reset() method ([#1743](https://github.com/googleapis/python-bigquery-dataframes/issues/1743)) ([36c359d](https://github.com/googleapis/python-bigquery-dataframes/commit/36c359d2521089e186a412d353daf9de6cfbc8f4))
+* Add DataFrame.round method ([#1742](https://github.com/googleapis/python-bigquery-dataframes/issues/1742)) ([3ea6043](https://github.com/googleapis/python-bigquery-dataframes/commit/3ea6043be7025fa7a11cca27b02f5505bbc9b129))
+* Add deferred data uploading ([#1720](https://github.com/googleapis/python-bigquery-dataframes/issues/1720)) ([1f6442e](https://github.com/googleapis/python-bigquery-dataframes/commit/1f6442e576c35ec784ccf9cab3d081d46e45a5ce))
+* Add deprecation warning to Gemini-1.5-X, text-embedding-004, and remove legacy models in notebooks and docs ([#1723](https://github.com/googleapis/python-bigquery-dataframes/issues/1723)) ([80aad9a](https://github.com/googleapis/python-bigquery-dataframes/commit/80aad9af794c2e06d1608c879f459a836fd4448b))
+* Add structured output for ai map, ai filter and ai join ([#1746](https://github.com/googleapis/python-bigquery-dataframes/issues/1746)) ([133ac6b](https://github.com/googleapis/python-bigquery-dataframes/commit/133ac6b0e1f1e7a12844a4b6fd5b26df59f7ef37))
+* Add support for df.loc[list, column(s)] ([#1761](https://github.com/googleapis/python-bigquery-dataframes/issues/1761)) ([768a757](https://github.com/googleapis/python-bigquery-dataframes/commit/768a7570845c4eb88f495d7f3c0f3158accdc231))
+* Include bq schema and query string in dry run results ([#1752](https://github.com/googleapis/python-bigquery-dataframes/issues/1752)) ([bb51147](https://github.com/googleapis/python-bigquery-dataframes/commit/bb511475b74cc253230725846098a9045be2e324))
+* Support `inplace=True` in `rename` and `rename_axis` ([#1744](https://github.com/googleapis/python-bigquery-dataframes/issues/1744)) ([734cc65](https://github.com/googleapis/python-bigquery-dataframes/commit/734cc652e435dc5d97a23411735aa51b7824e381))
+* Support `unique()` for Index ([#1750](https://github.com/googleapis/python-bigquery-dataframes/issues/1750)) ([27fac78](https://github.com/googleapis/python-bigquery-dataframes/commit/27fac78cb5654e5655aec861062837a7d4f3f679))
+* Support astype conversions to and from JSON dtypes ([#1716](https://github.com/googleapis/python-bigquery-dataframes/issues/1716)) ([8ef4de1](https://github.com/googleapis/python-bigquery-dataframes/commit/8ef4de10151717f88364a909b29fa7600e959ada))
+* Support dict param for dataframe.agg() ([#1772](https://github.com/googleapis/python-bigquery-dataframes/issues/1772))
([f9c29c8](https://github.com/googleapis/python-bigquery-dataframes/commit/f9c29c85053d8111a74ce382490daed36f8bb35b))
+* Support dtype parameter in read_csv for bigquery engine ([#1749](https://github.com/googleapis/python-bigquery-dataframes/issues/1749)) ([50dca4c](https://github.com/googleapis/python-bigquery-dataframes/commit/50dca4c706d78673b03f90eccf776118247ba30b))
+* Use read api for some peek ops ([#1731](https://github.com/googleapis/python-bigquery-dataframes/issues/1731)) ([108f4d2](https://github.com/googleapis/python-bigquery-dataframes/commit/108f4d259e1bcfbe6c7aa3c3c3f8f605cf7615ee))
+
+
+### Bug Fixes
+
+* Fix clip int series with float bounds ([#1739](https://github.com/googleapis/python-bigquery-dataframes/issues/1739)) ([d451aef](https://github.com/googleapis/python-bigquery-dataframes/commit/d451aefd2181aef250c3b48cceac09063081cab2))
+* Fix error with self-merge operations ([#1774](https://github.com/googleapis/python-bigquery-dataframes/issues/1774)) ([e5fe143](https://github.com/googleapis/python-bigquery-dataframes/commit/e5fe14339b4a40ab4a25657ee0453e4108cf8bba))
+* Fix the default value for na_value for numpy conversions ([#1766](https://github.com/googleapis/python-bigquery-dataframes/issues/1766)) ([0629cac](https://github.com/googleapis/python-bigquery-dataframes/commit/0629cac7f9a9370a72c1ae25e014eb478a4c8c08))
+* Include location in Session-based temporary storage manager DDL queries ([#1780](https://github.com/googleapis/python-bigquery-dataframes/issues/1780)) ([acba032](https://github.com/googleapis/python-bigquery-dataframes/commit/acba0321cafeb49f3e560a364ebbf3d15fb8af88))
+* Prevent creating unnecessary client objects in multithreaded environments ([#1757](https://github.com/googleapis/python-bigquery-dataframes/issues/1757)) ([1cf9f5e](https://github.com/googleapis/python-bigquery-dataframes/commit/1cf9f5e8dba733ee26d15fc5edc44c81e094e9a0))
+* Reduce bigquery table modification via DML for to_gbq ([#1737](https://github.com/googleapis/python-bigquery-dataframes/issues/1737)) ([545cdca](https://github.com/googleapis/python-bigquery-dataframes/commit/545cdcac1361607678c2574f0f31eb43950073e5))
+* Stop ignoring arguments to `MatrixFactorization.score(X, y)` ([#1726](https://github.com/googleapis/python-bigquery-dataframes/issues/1726)) ([55c07e9](https://github.com/googleapis/python-bigquery-dataframes/commit/55c07e9d4315949c37ffa3e03c8fedc6daf17faf))
+* Support JSON and STRUCT for bbq.sql_scalar ([#1754](https://github.com/googleapis/python-bigquery-dataframes/issues/1754)) ([190390b](https://github.com/googleapis/python-bigquery-dataframes/commit/190390b804c2131c2eaa624d7f025febb7784b01))
+* Support str.replace re.compile with flags ([#1736](https://github.com/googleapis/python-bigquery-dataframes/issues/1736)) ([f8d2cd2](https://github.com/googleapis/python-bigquery-dataframes/commit/f8d2cd24281415f4a8f9193b676f5483128cd173))
+
+
+### Performance Improvements
+
+* Faster local data comparison using identity ([#1738](https://github.com/googleapis/python-bigquery-dataframes/issues/1738)) ([2858b1e](https://github.com/googleapis/python-bigquery-dataframes/commit/2858b1efb4fe74097dcb17c086ee1dc18e53053c))
+* Optimize repr for unordered gbq table ([#1778](https://github.com/googleapis/python-bigquery-dataframes/issues/1778)) ([2bc4fbc](https://github.com/googleapis/python-bigquery-dataframes/commit/2bc4fbc78eba4bb2ee335e0475700a7ca5bc84d7))
+* Use JOB_CREATION_OPTIONAL when `allow_large_results=False`
([#1763](https://github.com/googleapis/python-bigquery-dataframes/issues/1763)) ([15f3f2a](https://github.com/googleapis/python-bigquery-dataframes/commit/15f3f2aa42cfe4a2233f62c5f8906e7f7658f9fa)) + + +### Dependencies + +* Avoid `gcsfs==2025.5.0` ([#1762](https://github.com/googleapis/python-bigquery-dataframes/issues/1762)) ([68d5e2c](https://github.com/googleapis/python-bigquery-dataframes/commit/68d5e2cbef3510cadc7e9dd199117c1e3b02d19f)) + + +### Documentation + +* Add llm output_schema notebook ([#1732](https://github.com/googleapis/python-bigquery-dataframes/issues/1732)) ([b2261cc](https://github.com/googleapis/python-bigquery-dataframes/commit/b2261cc07cd58b51d212f9bf495c5022e587f816)) +* Add MatrixFactorization to the table of contents ([#1725](https://github.com/googleapis/python-bigquery-dataframes/issues/1725)) ([611e43b](https://github.com/googleapis/python-bigquery-dataframes/commit/611e43b156483848a5470f889fb7b2b473ecff4d)) +* Fix typo for "population" in the `GeminiTextGenerator.predict(..., output_schema={...})` sample notebook ([#1748](https://github.com/googleapis/python-bigquery-dataframes/issues/1748)) ([bd07e05](https://github.com/googleapis/python-bigquery-dataframes/commit/bd07e05d26820313c052eaf41c267a1ab20b4fc6)) +* Integrations notebook extracts token from `bqclient._http.credentials` instead of `bqclient._credentials` ([#1784](https://github.com/googleapis/python-bigquery-dataframes/issues/1784)) ([6e63eca](https://github.com/googleapis/python-bigquery-dataframes/commit/6e63eca29f20d83435878273604816ce7595c396)) +* Updated multimodal notebook instructions ([#1745](https://github.com/googleapis/python-bigquery-dataframes/issues/1745)) ([1df8ca6](https://github.com/googleapis/python-bigquery-dataframes/commit/1df8ca6312ee428d55c2091a00c73b13d9a6b193)) +* Use partial ordering mode in the quickstart sample ([#1734](https://github.com/googleapis/python-bigquery-dataframes/issues/1734)) ([476b7dd](https://github.com/googleapis/python-bigquery-dataframes/commit/476b7dd7c2639cb6804272d06aa5c1db666819da)) + ## [2.4.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.3.0...v2.4.0) (2025-05-12) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 775ef70bc7..52b47e3e9a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -56,12 +56,21 @@ class Options: """Global options affecting BigQuery DataFrames behavior.""" def __init__(self): + self.reset() + + def reset(self) -> Options: + """Reset the option settings to defaults. + + Returns: + bigframes._config.Options: Options object with default values. + """ self._local = ThreadLocalConfig() # BigQuery options are special because they can only be set once per # session, so we need an indicator as to whether we are using the # thread-local session or the global session. 
self._bigquery_options = bigquery_options.BigQueryOptions() + return self def _init_bigquery_thread_local(self): """Initialize thread-local options, based on current global options.""" diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 3a6008eaa8..d591ea85b3 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -16,10 +16,11 @@ from __future__ import annotations -from typing import Literal, Optional +from typing import Literal, Optional, Sequence, Tuple import warnings import google.auth.credentials +import requests.adapters import bigframes.enums import bigframes.exceptions as bfe @@ -90,6 +91,9 @@ def __init__( allow_large_results: bool = False, ordering_mode: Literal["strict", "partial"] = "strict", client_endpoints_override: Optional[dict] = None, + requests_transport_adapters: Sequence[ + Tuple[str, requests.adapters.BaseAdapter] + ] = (), ): self._credentials = credentials self._project = project @@ -100,6 +104,7 @@ def __init__( self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check self._allow_large_results = allow_large_results + self._requests_transport_adapters = requests_transport_adapters self._session_started = False # Determines the ordering strictness for the session. self._ordering_mode = _validate_ordering_mode(ordering_mode) @@ -379,3 +384,43 @@ def client_endpoints_override(self, value: dict): ) self._client_endpoints_override = value + + @property + def requests_transport_adapters( + self, + ) -> Sequence[Tuple[str, requests.adapters.BaseAdapter]]: + """Transport adapters for requests-based REST clients such as the + google-cloud-bigquery package. + + For more details, see the explanation in `requests guide to transport + adapters + `_. + + **Examples:** + + Increase the connection pool size using the requests `HTTPAdapter + `_. + + >>> import bigframes.pandas as bpd + >>> bpd.options.bigquery.requests_transport_adapters = ( + ... ("http://", requests.adapters.HTTPAdapter(pool_maxsize=100)), + ... ("https://", requests.adapters.HTTPAdapter(pool_maxsize=100)), + ... ) # doctest: +SKIP + + Returns: + Sequence[Tuple[str, requests.adapters.BaseAdapter]]: + Prefixes and corresponding transport adapters to `mount + `_ + in requests-based REST clients. 
+ """ + return self._requests_transport_adapters + + @requests_transport_adapters.setter + def requests_transport_adapters( + self, value: Sequence[Tuple[str, requests.adapters.BaseAdapter]] + ) -> None: + if self._session_started and self._requests_transport_adapters != value: + raise ValueError( + SESSION_STARTED_MESSAGE.format(attribute="requests_transport_adapters") + ) + self._requests_transport_adapters = value diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 02dd77fdd9..301207bb31 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -37,6 +37,7 @@ json_extract, json_extract_array, json_extract_string_array, + json_query, json_set, json_value, parse_json, @@ -58,10 +59,11 @@ "st_distance", "st_intersection", # json ops - "json_set", "json_extract", "json_extract_array", "json_extract_string_array", + "json_query", + "json_set", "json_value", "parse_json", # search ops diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index b59fe40d99..561fb57348 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -22,9 +22,11 @@ from __future__ import annotations from typing import Any, cast, Optional, Sequence, Tuple, Union +import warnings import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.exceptions as bfe import bigframes.operations as ops import bigframes.series as series @@ -87,9 +89,13 @@ def json_extract( input: series.Series, json_path: str, ) -> series.Series: - """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` - value. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. + """Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` or + ``JSON`` value. This function uses single quotes and brackets to escape invalid + JSONPath characters in JSON keys. + + .. deprecated:: 2.5.0 + The ``json_extract`` is deprecated and will be removed in a future version. + Use ``json_query`` instead. **Examples:** @@ -111,6 +117,11 @@ def json_extract( Returns: bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ + msg = ( + "The `json_extract` is deprecated and will be removed in a future version. " + "Use `json_query` instead." + ) + warnings.warn(bfe.format_message(msg), category=UserWarning) return input._apply_unary_op(ops.JSONExtract(json_path=json_path)) @@ -231,6 +242,37 @@ def json_extract_string_array( return array_series +def json_query( + input: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` + or ``JSON`` value. This function uses double quotes to escape invalid JSONPath + characters in JSON keys. For example: ``"a.b"``. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_query(s, json_path="$.class") + 0 {"students":[{"id":5},{"id":12}]} + dtype: string + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
+ """ + return input._apply_unary_op(ops.JSONQuery(json_path=json_path)) + + def json_value( input: series.Series, json_path: str, diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 7ccf63fcda..a84c074e01 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -20,6 +20,7 @@ import google.cloud.bigquery +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.sql import bigframes.dataframe import bigframes.dtypes @@ -72,16 +73,16 @@ def sql_scalar( # Another benefit of this is that if there is a syntax error in the SQL # template, then this will fail with an error earlier in the process, # aiding users in debugging. - base_series = columns[0] - literals = [ - bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns + literals_sql = [ + sqlglot_ir._literal(None, column.dtype).sql(dialect="bigquery") + for column in columns ] - literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals] + select_sql = sql_template.format(*literals_sql) + dry_run_sql = f"SELECT {select_sql}" # Use the executor directly, because we want the original column IDs, not # the user-friendly column names that block.to_sql_query() would produce. - select_sql = sql_template.format(*literals_sql) - dry_run_sql = f"SELECT {select_sql}" + base_series = columns[0] bqclient = base_series._session.bqclient job = bqclient.query( dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True) diff --git a/bigframes/constants.py b/bigframes/constants.py index 89f27afd78..b6e0b8b221 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -128,4 +128,8 @@ # BigQuery default is 10000, leave 100 for overhead MAX_COLUMNS = 9900 +# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table. +# Also must assume that text encoding as literals is much less efficient than in-memory representation. +MAX_INLINE_BYTES = 5000 + SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows." 
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 60f5315554..20773fd1b4 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -133,8 +133,17 @@ def from_table( ordering=ordering, n_rows=n_rows, ) + return cls.from_bq_data_source(source_def, scan_list, session) + + @classmethod + def from_bq_data_source( + cls, + source: nodes.BigqueryDataSource, + scan_list: nodes.ScanList, + session: Session, + ): node = nodes.ReadTableNode( - source=source_def, + source=source, scan_list=scan_list, table_session=session, ) diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 5509adc0ea..45e3c40701 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -22,7 +22,7 @@ import typing from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple -from bigframes.core import identifiers +from bigframes.core import field, identifiers import bigframes.core.schema as schemata import bigframes.dtypes @@ -34,23 +34,6 @@ T = typing.TypeVar("T") -@dataclasses.dataclass(frozen=True) -class Field: - id: identifiers.ColumnId - dtype: bigframes.dtypes.Dtype - # Best effort, nullable=True if not certain - nullable: bool = True - - def with_nullable(self) -> Field: - return Field(self.id, self.dtype, nullable=True) - - def with_nonnull(self) -> Field: - return Field(self.id, self.dtype, nullable=False) - - def with_id(self, id: identifiers.ColumnId) -> Field: - return Field(id, self.dtype, nullable=self.nullable) - - @dataclasses.dataclass(eq=False, frozen=True) class BigFrameNode: """ @@ -162,7 +145,7 @@ def roots(self) -> typing.Set[BigFrameNode]: # TODO: Store some local data lazily for select, aggregate nodes. @property @abc.abstractmethod - def fields(self) -> Sequence[Field]: + def fields(self) -> Sequence[field.Field]: ... @property @@ -292,7 +275,7 @@ def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]: return {field.id: field.dtype for field in self.fields} @functools.cached_property - def field_by_id(self) -> Mapping[identifiers.ColumnId, Field]: + def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]: return {field.id: field for field in self.fields} # Plan algorithms diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d3107a0623..35cb7d41ae 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -153,6 +153,7 @@ def __init__( self._stats_cache[" ".join(self.index_columns)] = {} self._transpose_cache: Optional[Block] = transpose_cache + self._view_ref: Optional[bigquery.TableReference] = None @classmethod def from_local( @@ -2165,7 +2166,7 @@ def merge( result_columns.append(get_column_left[col_id]) for col_id in other.value_columns: if col_id in right_join_ids: - if other.col_id_to_label[matching_right_id] in matching_join_labels: + if other.col_id_to_label[col_id] in matching_join_labels: pass else: result_columns.append(get_column_right[col_id]) @@ -2487,6 +2488,17 @@ def to_sql_query( idx_labels, ) + def to_view(self, include_index: bool) -> bigquery.TableReference: + """ + Creates a temporary BigQuery VIEW with the SQL corresponding to this block. 
+ """ + if self._view_ref is not None: + return self._view_ref + + sql, _, _ = self.to_sql_query(include_index=include_index) + self._view_ref = self.session._create_temp_view(sql) + return self._view_ref + def cached(self, *, force: bool = False, session_aware: bool = False) -> None: """Write the block to a session table.""" # use a heuristic for whether something needs to be cached @@ -2644,9 +2656,8 @@ def _get_rows_as_json_values(self) -> Block: SELECT {select_columns_csv} FROM T1 """ # The only ways this code is used is through df.apply(axis=1) cope path - # TODO: Stop using internal API destination, query_job = self.session._loader._query_to_destination( - json_sql, cluster_candidates=[ordering_column_name], api_name="apply" + json_sql, cluster_candidates=[ordering_column_name] ) if not destination: raise ValueError(f"Query job {query_job} did not produce result table") diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index b2f018e80a..14d8e8501c 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -92,6 +92,8 @@ def _( return args[0] < args[1] if isinstance(op, ops.eq_op.__class__): return args[0] == args[1] + if isinstance(op, ops.ne_op.__class__): + return args[0] != args[1] if isinstance(op, ops.mod_op.__class__): return args[0] % args[1] if isinstance(op, ops.coalesce_op.__class__): @@ -101,6 +103,9 @@ def _( for pred, result in zip(args[2::2], args[3::2]): return expr.when(pred).then(result) return expr + if isinstance(op, ops.where_op.__class__): + original, condition, otherwise = args + return pl.when(condition).then(original).otherwise(otherwise) raise NotImplementedError(f"Polars compiler hasn't implemented {op}") @dataclasses.dataclass(frozen=True) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 78e373121e..7707f16dad 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1164,6 +1164,35 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): elif to_type == ibis_dtypes.time: return x_converted.time() + if to_type == ibis_dtypes.json: + if x.type() == ibis_dtypes.string: + return parse_json_in_safe(x) if op.safe else parse_json(x) + if x.type() == ibis_dtypes.bool: + x_bool = typing.cast( + ibis_types.StringValue, + bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ), + ).lower() + return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) + if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): + x_str = bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ) + return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) + + if x.type() == ibis_dtypes.json: + if to_type == ibis_dtypes.int64: + return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) + if to_type == ibis_dtypes.float64: + return ( + cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) + ) + if to_type == ibis_dtypes.bool: + return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) + if to_type == ibis_dtypes.string: + return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @@ -1327,6 +1356,19 @@ def json_extract_string_array_op_impl( return 
json_extract_string_array(json_obj=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True) +def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery): + # Define a user-defined function whose returned type is dynamically matching the input. + def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... + + return_type = x.type() + json_query.__annotations__["return"] = return_type + json_query_op = ibis_udf.scalar.builtin(json_query) + return json_query_op(json_or_json_string=x, json_path=op.json_path) + + @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) @@ -1797,6 +1839,13 @@ def fillna_op( @scalar_op_compiler.register_binary_op(ops.round_op) def round_op(x: ibis_types.Value, y: ibis_types.Value): + if x.type().is_integer(): + # bq produces float64, but pandas returns int + return ( + typing.cast(ibis_types.NumericValue, x) + .round(digits=typing.cast(ibis_types.IntegerValue, y)) + .cast(ibis_dtypes.int64) + ) return typing.cast(ibis_types.NumericValue, x).round( digits=typing.cast(ibis_types.IntegerValue, y) ) @@ -1899,34 +1948,18 @@ def clip_op( if isinstance(lower, ibis_types.NullScalar) and ( not isinstance(upper, ibis_types.NullScalar) ): - return ( - ibis_api.case() # type: ignore - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) + return ibis_api.least(original, upper) elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( upper, ibis_types.NullScalar ): - return ( - ibis_api.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .else_(original) - .end() - ) + return ibis_api.greatest(original, lower) elif isinstance(lower, ibis_types.NullScalar) and ( isinstance(upper, ibis_types.NullScalar) ): return original else: # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. 
This implementation requires that lower_bound < upper_bound - return ( - ibis_api.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) + return ibis_api.greatest(ibis_api.least(original, upper), lower) # N-ary Operations @@ -2047,6 +2080,11 @@ def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] """Converts a JSON-formatted STRING value to a JSON value.""" +@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON") +def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] + """Converts a JSON-formatted STRING value to a JSON value in the safe mode.""" + + @ibis_udf.scalar.builtin(name="json_set") def json_set( # type: ignore[empty-body] json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value @@ -2075,6 +2113,46 @@ def json_value( # type: ignore[empty-body] """Retrieve value of a JSON field as plain STRING.""" +@ibis_udf.scalar.builtin(name="INT64") +def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.INT64") +def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value in the safe mode.""" + + +@ibis_udf.scalar.builtin(name="FLOAT64") +def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.FLOAT64") +def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="BOOL") +def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.BOOL") +def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="STRING") +def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.STRING") +def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + @ibis_udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body] """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 91d1fa0d85..1cb270297c 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -26,6 +26,7 @@ import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.core.ordering as bf_ordering +from bigframes.core.rewrite import schema_binding class SQLGlotCompiler: @@ -120,7 +121,14 @@ def _remap_variables(self, node: nodes.ResultNode) -> 
nodes.ResultNode: def _compile_result_node(self, root: nodes.ResultNode) -> str: sqlglot_ir = self.compile_node(root.child) - # TODO: add order_by, limit, and selections to sqlglot_expr + + selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( + (name, scalar_compiler.compile_scalar_expression(ref)) + for ref, name in root.output_cols + ) + sqlglot_ir = sqlglot_ir.select(selected_cols) + + # TODO: add order_by, limit to sqlglot_expr return sqlglot_ir.sql @functools.lru_cache(maxsize=5000) @@ -176,6 +184,6 @@ def compile_projection( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) - node = nodes.bottom_up(node, rewrite.rewrite_timedelta_expressions) + node = nodes.bottom_up(node, schema_binding.bind_schema_to_expressions) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) return node diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 48aa4c7b0b..0f059d482c 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -35,6 +35,13 @@ def compile_deref_expression(expr: expression.DerefOp) -> sge.Expression: return sge.ColumnDef(this=sge.to_identifier(expr.id.sql, quoted=True)) +@compile_scalar_expression.register +def compile_field_ref_expression( + expr: expression.SchemaFieldRefExpression, +) -> sge.Expression: + return sge.ColumnDef(this=sge.to_identifier(expr.field.id.sql, quoted=True)) + + @compile_scalar_expression.register def compile_constant_expression( expr: expression.ScalarConstantExpression, diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index b23349bcbc..23b441591b 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -17,6 +17,8 @@ import dataclasses import typing +from google.cloud import bigquery +import numpy as np import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery @@ -104,19 +106,44 @@ def from_pyarrow( ) return cls(expr=sg.select(sge.Star()).from_(expr), uid_gen=uid_gen) + @classmethod + def from_query_string( + cls, + query_string: str, + ) -> SQLGlotIR: + """Builds SQLGlot expression from a query string""" + uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator() + cte_name = sge.to_identifier( + next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted + ) + cte = sge.CTE( + this=query_string, + alias=cte_name, + ) + select_expr = sge.Select().select(sge.Star()).from_(sge.Table(this=cte_name)) + select_expr.set("with", sge.With(expressions=[cte])) + return cls(expr=select_expr, uid_gen=uid_gen) + def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], ) -> SQLGlotIR: - cols_expr = [ + selections = [ sge.Alias( this=expr, alias=sge.to_identifier(id, quoted=self.quoted), ) for id, expr in selected_cols ] - new_expr = self._encapsulate_as_cte().select(*cols_expr, append=False) - return SQLGlotIR(expr=new_expr) + # Attempts to simplify selected columns when the original and new column + # names are simply aliases of each other. 
+ squashed_selections = _squash_selections(self.expr.expressions, selections) + if squashed_selections != []: + new_expr = self.expr.select(*squashed_selections, append=False) + return SQLGlotIR(expr=new_expr) + else: + new_expr = self._encapsulate_as_cte().select(*selections, append=False) + return SQLGlotIR(expr=new_expr) def project( self, @@ -133,6 +160,36 @@ def project( select_expr = self.expr.select(*projected_cols_expr, append=True) return SQLGlotIR(expr=select_expr) + def insert( + self, + destination: bigquery.TableReference, + ) -> str: + return sge.insert(self.expr.subquery(), _table(destination)).sql( + dialect=self.dialect, pretty=self.pretty + ) + + def replace( + self, + destination: bigquery.TableReference, + ) -> str: + # Workaround for SQLGlot breaking change: + # https://github.com/tobymao/sqlglot/pull/4495 + whens_expr = [ + sge.When(matched=False, source=True, then=sge.Delete()), + sge.When(matched=False, then=sge.Insert(this=sge.Var(this="ROW"))), + ] + whens_str = "\n".join( + when_expr.sql(dialect=self.dialect, pretty=self.pretty) + for when_expr in whens_expr + ) + + merge_str = sge.Merge( + this=_table(destination), + using=self.expr.subquery(), + on=_literal(False, dtypes.BOOL_DTYPE), + ).sql(dialect=self.dialect, pretty=self.pretty) + return f"{merge_str}\n{whens_str}" + def _encapsulate_as_cte( self, ) -> sge.Select: @@ -149,7 +206,7 @@ def _encapsulate_as_cte( this=select_expr, alias=new_cte_name, ) - new_with_clause = sge.With(expressions=existing_ctes + [new_cte]) + new_with_clause = sge.With(expressions=[*existing_ctes, new_cte]) new_select_expr = ( sge.Select().select(sge.Star()).from_(sge.Table(this=new_cte_name)) ) @@ -164,7 +221,11 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: elif dtype == dtypes.BYTES_DTYPE: return _cast(str(value), sqlglot_type) elif dtypes.is_time_like(dtype): + if isinstance(value, np.generic): + value = value.item() return _cast(sge.convert(value.isoformat()), sqlglot_type) + elif dtype in (dtypes.NUMERIC_DTYPE, dtypes.BIGNUMERIC_DTYPE): + return _cast(sge.convert(value), sqlglot_type) elif dtypes.is_geo_like(dtype): wkt = value if isinstance(value, str) else to_wkt(value) return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt)) @@ -185,8 +246,77 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: ) return values if len(value) > 0 else _cast(values, sqlglot_type) else: + if isinstance(value, np.generic): + value = value.item() return sge.convert(value) def _cast(arg: typing.Any, to: str) -> sge.Cast: return sge.Cast(this=arg, to=to) + + +def _table(table: bigquery.TableReference) -> sge.Table: + return sge.Table( + this=sg.to_identifier(table.table_id, quoted=True), + db=sg.to_identifier(table.dataset_id, quoted=True), + catalog=sg.to_identifier(table.project, quoted=True), + ) + + +def _squash_selections( + old_expr: list[sge.Expression], new_expr: list[sge.Alias] +) -> list[sge.Alias]: + """ + Simplifies the select column expressions if existing (old_expr) and + new (new_expr) selected columns are both simple aliases of column definitions. 
+ + Example: + old_expr: [A AS X, B AS Y] + new_expr: [X AS P, Y AS Q] + Result: [A AS P, B AS Q] + """ + old_alias_map: typing.Dict[str, str] = {} + for selected in old_expr: + column_alias_pair = _get_column_alias_pair(selected) + if column_alias_pair is None: + return [] + else: + old_alias_map[column_alias_pair[1]] = column_alias_pair[0] + + new_selected_cols: typing.List[sge.Alias] = [] + for selected in new_expr: + column_alias_pair = _get_column_alias_pair(selected) + if column_alias_pair is None or column_alias_pair[0] not in old_alias_map: + return [] + else: + new_alias_expr = sge.Alias( + this=sge.ColumnDef( + this=sge.to_identifier( + old_alias_map[column_alias_pair[0]], quoted=True + ) + ), + alias=sg.to_identifier(column_alias_pair[1], quoted=True), + ) + new_selected_cols.append(new_alias_expr) + return new_selected_cols + + +def _get_column_alias_pair( + expr: sge.Expression, +) -> typing.Optional[typing.Tuple[str, str]]: + """Checks if an expression is a simple alias of a column definition + (e.g., "column_name AS alias_name"). + If it is, returns a tuple containing the alias name and original column name. + Returns `None` otherwise. + """ + if not isinstance(expr, sge.Alias): + return None + if not isinstance(expr.this, sge.ColumnDef): + return None + + column_def_expr: sge.ColumnDef = expr.this + if not isinstance(column_def_expr.this, sge.Identifier): + return None + + original_identifier: sge.Identifier = column_def_expr.this + return (original_identifier.this, expr.alias) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index afd290827d..238b588fea 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -16,14 +16,16 @@ import abc import dataclasses +import functools import itertools import typing from typing import Generator, Mapping, TypeVar, Union import pandas as pd +from bigframes import dtypes +from bigframes.core import field import bigframes.core.identifiers as ids -import bigframes.dtypes as dtypes import bigframes.operations import bigframes.operations.aggregations as agg_ops @@ -50,7 +52,7 @@ class Aggregation(abc.ABC): @abc.abstractmethod def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: ... @@ -72,7 +74,7 @@ class NullaryAggregation(Aggregation): op: agg_ops.NullaryWindowOp = dataclasses.field() def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: return self.op.output_type() @@ -86,13 +88,17 @@ def remap_column_refs( @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): - op: agg_ops.UnaryWindowOp = dataclasses.field() - arg: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() + op: agg_ops.UnaryWindowOp + arg: Union[DerefOp, ScalarConstantExpression] def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: - return self.op.output_type(self.arg.output_type(input_types)) + # TODO(b/419300717) Remove resolutions once defers are cleaned up. 
+ resolved_expr = bind_schema_fields(self.arg, input_fields) + assert resolved_expr.is_resolved + + return self.op.output_type(resolved_expr.output_type) @property def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: @@ -118,10 +124,16 @@ class BinaryAggregation(Aggregation): right: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + self, input_fields: Mapping[ids.ColumnId, field.Field] ) -> dtypes.ExpressionType: + # TODO(b/419300717) Remove resolutions once defers are cleaned up. + left_resolved_expr = bind_schema_fields(self.left, input_fields) + assert left_resolved_expr.is_resolved + right_resolved_expr = bind_schema_fields(self.right, input_fields) + assert right_resolved_expr.is_resolved + return self.op.output_type( - self.left.output_type(input_types), self.right.output_type(input_types) + left_resolved_expr.output_type, left_resolved_expr.output_type ) @property @@ -189,10 +201,17 @@ def remap_column_refs( def is_const(self) -> bool: ... + @property @abc.abstractmethod - def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] - ) -> dtypes.ExpressionType: + def is_resolved(self) -> bool: + """ + Returns true if and only if the expression's output type and nullability is available. + """ + ... + + @property + @abc.abstractmethod + def output_type(self) -> dtypes.ExpressionType: ... @abc.abstractmethod @@ -256,9 +275,12 @@ def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: def nullable(self) -> bool: return pd.isna(self.value) # type: ignore - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: + @property + def is_resolved(self) -> bool: + return True + + @property + def output_type(self) -> dtypes.ExpressionType: return self.dtype def bind_variables( @@ -308,9 +330,12 @@ def is_const(self) -> bool: def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: return () - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: + @property + def is_resolved(self): + return False + + @property + def output_type(self) -> dtypes.ExpressionType: raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_refs( @@ -340,7 +365,7 @@ def is_identity(self) -> bool: @dataclasses.dataclass(frozen=True) class DerefOp(Expression): - """A variable expression representing an unbound variable.""" + """An expression that refers to a column by ID.""" id: ids.ColumnId @@ -357,13 +382,13 @@ def nullable(self) -> bool: # Safe default, need to actually bind input schema to determine return True - def output_type( - self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] - ) -> dtypes.ExpressionType: - if self.id in input_types: - return input_types[self.id] - else: - raise ValueError(f"Type of variable {self.id} has not been fixed.") + @property + def is_resolved(self) -> bool: + return False + + @property + def output_type(self) -> dtypes.ExpressionType: + raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_variables( self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False @@ -390,6 +415,55 @@ def is_identity(self) -> bool: return True +@dataclasses.dataclass(frozen=True) +class SchemaFieldRefExpression(Expression): + """An expression representing a schema field. 
This is essentially a DerefOp with input schema bound.""" + + field: field.Field + + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return (self.field.id,) + + @property + def is_const(self) -> bool: + return False + + @property + def nullable(self) -> bool: + return self.field.nullable + + @property + def is_resolved(self) -> bool: + return True + + @property + def output_type(self) -> dtypes.ExpressionType: + return self.field.dtype + + def bind_variables( + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + ) -> Expression: + return self + + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> Expression: + if self.field.id in bindings.keys(): + return bindings[self.field.id] + return self + + @property + def is_bijective(self) -> bool: + return True + + @property + def is_identity(self) -> bool: + return True + + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): """An expression representing a scalar operation applied to 1 or more argument sub-expressions.""" @@ -429,13 +503,18 @@ def nullable(self) -> bool: ) return not null_free - def output_type( - self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] - ) -> dtypes.ExpressionType: - operand_types = tuple( - map(lambda x: x.output_type(input_types=input_types), self.inputs) - ) - return self.op.output_type(*operand_types) + @functools.cached_property + def is_resolved(self) -> bool: + return all(input.is_resolved for input in self.inputs) + + @functools.cached_property + def output_type(self) -> dtypes.ExpressionType: + if not self.is_resolved: + raise ValueError(f"Type of expression {self.op.name} has not been fixed.") + + input_types = [input.output_type for input in self.inputs] + + return self.op.output_type(*input_types) def bind_variables( self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False @@ -475,4 +554,22 @@ def deterministic(self) -> bool: ) +def bind_schema_fields( + expr: Expression, field_by_id: Mapping[ids.ColumnId, field.Field] +) -> Expression: + """ + Updates `DerefOp` expressions by replacing column IDs with actual schema fields(columns). + + We can only deduct an expression's output type and nullability after binding schema fields to + all its deref expressions. + """ + if expr.is_resolved: + return expr + + expr_by_id = { + id: SchemaFieldRefExpression(field) for id, field in field_by_id.items() + } + return expr.bind_refs(expr_by_id) + + RefOrConstant = Union[DerefOp, ScalarConstantExpression] diff --git a/bigframes/core/field.py b/bigframes/core/field.py new file mode 100644 index 0000000000..c5b7dd3555 --- /dev/null +++ b/bigframes/core/field.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dataclasses + +from bigframes import dtypes +from bigframes.core import identifiers + + +@dataclasses.dataclass(frozen=True) +class Field: + id: identifiers.ColumnId + dtype: dtypes.Dtype + # Best effort, nullable=True if not certain + nullable: bool = True + + def with_nullable(self) -> Field: + return Field(self.id, self.dtype, nullable=True) + + def with_nonnull(self) -> Field: + return Field(self.id, self.dtype, nullable=False) + + def with_id(self, id: identifiers.ColumnId) -> Field: + return Field(id, self.dtype, nullable=self.nullable) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 6258eb00d5..c60e40880b 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -155,8 +155,8 @@ def __getitem__(self, key): # row key. We must choose one, so bias towards treating as multi-part row label if isinstance(key, tuple) and len(key) == 2: is_row_multi_index = self._dataframe.index.nlevels > 1 - is_first_item_tuple = isinstance(key[0], tuple) - if not is_row_multi_index or is_first_item_tuple: + is_first_item_list_or_tuple = isinstance(key[0], (tuple, list)) + if not is_row_multi_index or is_first_item_list_or_tuple: df = typing.cast( bigframes.dataframe.DataFrame, _loc_getitem_series_or_dataframe(self._dataframe, key[0]), diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 9b4b5e4290..44b1d9d4fa 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -145,12 +145,7 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._block.with_index_labels(values) - if self._linked_frame is not None: - self._linked_frame._set_block( - self._linked_frame._block.with_index_labels(values) - ) - self._block = new_block + self.rename(values, inplace=True) @property def nlevels(self) -> int: @@ -411,11 +406,62 @@ def fillna(self, value=None) -> Index: ops.fillna_op.as_expr(ex.free_var("arg"), ex.const(value)) ) - def rename(self, name: Union[str, Sequence[str]]) -> Index: - names = [name] if isinstance(name, str) else list(name) + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + ) -> Index: + ... + + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: Literal[False], + ) -> Index: + ... + + @overload + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: Literal[True], + ) -> None: + ... + + def rename( + self, + name: Union[blocks.Label, Sequence[blocks.Label]], + *, + inplace: bool = False, + ) -> Optional[Index]: + # Tuples are allowed as a label, but we specifically exclude them here. + # This is because tuples are hashable, but we want to treat them as a + # sequence. If name is iterable, we want to assume we're working with a + # MultiIndex. Unfortunately, strings are iterable and we don't want a + # list of all the characters, so specifically exclude the non-tuple + # hashables. 
+ if isinstance(name, blocks.Label) and not isinstance(name, tuple): + names = [name] + else: + names = list(name) + if len(names) != self.nlevels: raise ValueError("'name' must be same length as levels") - return Index(self._block.with_index_labels(names)) + + new_block = self._block.with_index_labels(names) + + if inplace: + if self._linked_frame is not None: + self._linked_frame._set_block( + self._linked_frame._block.with_index_labels(names) + ) + self._block = new_block + return None + else: + return Index(new_block) def drop( self, @@ -451,6 +497,12 @@ def drop_duplicates(self, *, keep: str = "first") -> Index: block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) return Index(block) + def unique(self, level: Hashable | int | None = None) -> Index: + if level is None: + return self.drop_duplicates() + + return self.get_level_values(level).drop_duplicates() + def isin(self, values) -> Index: if not utils.is_list_like(values): raise TypeError( diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index d23f3538dd..2e8c4aff44 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -54,8 +54,8 @@ def from_arrow(cls, table: pa.Table) -> LocalTableMetadata: @dataclasses.dataclass(frozen=True) class ManagedArrowTable: - data: pa.Table = dataclasses.field(hash=False) - schema: schemata.ArraySchema = dataclasses.field(hash=False) + data: pa.Table = dataclasses.field(hash=False, compare=False) + schema: schemata.ArraySchema = dataclasses.field(hash=False, compare=False) id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4) @functools.cached_property diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 8be46f531c..6021c7075a 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -15,7 +15,7 @@ import functools import inspect import threading -from typing import List +from typing import List, Optional from google.cloud import bigquery import pandas @@ -28,6 +28,7 @@ MAX_LABELS_COUNT = 64 - 8 PANDAS_API_TRACKING_TASK = "pandas_api_tracking" PANDAS_PARAM_TRACKING_TASK = "pandas_param_tracking" +LOG_OVERRIDE_NAME = "__log_override_name__" _api_methods: List = [] _excluded_methods = ["__setattr__", "__getattr__"] @@ -37,8 +38,8 @@ def submit_pandas_labels( - bq_client: bigquery.Client, - class_name: str, + bq_client: Optional[bigquery.Client], + base_name: str, method_name: str, args=(), kwargs={}, @@ -54,7 +55,7 @@ def submit_pandas_labels( Args: bq_client (bigquery.Client): The client used to interact with BigQuery. - class_name (str): The name of the pandas class being used. + base_name (str): The name of the pandas class/module being used. method_name (str): The name of the method being invoked. args (tuple): The positional arguments passed to the method. kwargs (dict): The keyword arguments passed to the method. @@ -63,25 +64,29 @@ def submit_pandas_labels( - 'PANDAS_PARAM_TRACKING_TASK': Indicates that the unimplemented feature is a parameter of a method. 
""" - if method_name.startswith("_") and not method_name.startswith("__"): + if bq_client is None or ( + method_name.startswith("_") and not method_name.startswith("__") + ): return labels_dict = { "task": task, - "class_name": class_name.lower(), + "class_name": base_name.lower(), "method_name": method_name.lower(), "args_count": len(args), } - if hasattr(pandas, class_name): - cls = getattr(pandas, class_name) + # getattr(pandas, "pandas") returns pandas + # so we can also use this for pandas.function + if hasattr(pandas, base_name): + base = getattr(pandas, base_name) else: return # Omit __call__, because its not implemented on the actual instances of # DataFrame/Series, only as the constructor. - if method_name != "__call__" and hasattr(cls, method_name): - method = getattr(cls, method_name) + if method_name != "__call__" and hasattr(base, method_name): + method = getattr(base, method_name) else: return @@ -110,30 +115,29 @@ def submit_pandas_labels( bq_client.query(query, job_config=job_config) -def class_logger(decorated_cls=None, /, *, include_internal_calls=False): +def class_logger(decorated_cls=None): """Decorator that adds logging functionality to each method of the class.""" def wrap(cls): for attr_name, attr_value in cls.__dict__.items(): if callable(attr_value) and (attr_name not in _excluded_methods): if isinstance(attr_value, staticmethod): - # TODO(b/390244171) support for staticmethod - pass + setattr( + cls, + attr_name, + staticmethod(method_logger(attr_value)), + ) else: setattr( cls, attr_name, - method_logger( - attr_value, - cls, - include_internal_calls, - ), + method_logger(attr_value), ) elif isinstance(attr_value, property): setattr( cls, attr_name, - property_logger(attr_value, cls, include_internal_calls), + property_logger(attr_value), ) return cls @@ -145,33 +149,39 @@ def wrap(cls): return wrap(decorated_cls) -def method_logger(method, decorated_cls, include_internal_calls: bool): +def method_logger(method, /, *, custom_base_name: Optional[str] = None): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) - def wrapper(self, *args, **kwargs): - class_name = decorated_cls.__name__ # Access decorated class name - api_method_name = str(method.__name__) - full_method_name = f"{class_name.lower()}-{api_method_name}" - + def wrapper(*args, **kwargs): + api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) + if custom_base_name is None: + qualname_parts = getattr(method, "__qualname__", method.__name__).split(".") + class_name = qualname_parts[-2] if len(qualname_parts) > 1 else "" + base_name = ( + class_name if class_name else "_".join(method.__module__.split(".")[1:]) + ) + else: + base_name = custom_base_name + + full_method_name = f"{base_name.lower()}-{api_method_name}" # Track directly called methods - if len(_call_stack) == 0 or include_internal_calls: + if len(_call_stack) == 0: add_api_method(full_method_name) _call_stack.append(full_method_name) try: - return method(self, *args, **kwargs) + return method(*args, **kwargs) except (NotImplementedError, TypeError) as e: # Log method parameters that are implemented in pandas but either missing (TypeError) # or not fully supported (NotImplementedError) in BigFrames. # Logging is currently supported only when we can access the bqclient through - # self._block.expr.session.bqclient. Also, to avoid generating multiple queries - # because of internal calls, we log only when the method is directly invoked. 
- if hasattr(self, "_block") and len(_call_stack) == 1: + # _block.session.bqclient. + if len(_call_stack) == 1: submit_pandas_labels( - self._block.expr.session.bqclient, - class_name, + _get_bq_client(*args, **kwargs), + base_name, api_method_name, args, kwargs, @@ -184,22 +194,23 @@ def wrapper(self, *args, **kwargs): return wrapper -def property_logger(prop, decorated_cls, include_internal_calls: bool): +def property_logger(prop): """Decorator that adds logging functionality to a property.""" - def shared_wrapper(f): - @functools.wraps(f) + def shared_wrapper(prop): + @functools.wraps(prop) def wrapped(*args, **kwargs): - class_name = decorated_cls.__name__ - property_name = f.__name__ + qualname_parts = getattr(prop, "__qualname__", prop.__name__).split(".") + class_name = qualname_parts[-2] if len(qualname_parts) > 1 else "" + property_name = prop.__name__ full_property_name = f"{class_name.lower()}-{property_name.lower()}" - if len(_call_stack) == 0 or include_internal_calls: + if len(_call_stack) == 0: add_api_method(full_property_name) _call_stack.append(full_property_name) try: - return f(*args, **kwargs) + return prop(*args, **kwargs) finally: _call_stack.pop() @@ -213,12 +224,24 @@ def wrapped(*args, **kwargs): ) +def log_name_override(name: str): + """ + Attaches a custom name to be used by logger. + """ + + def wrapper(func): + setattr(func, LOG_OVERRIDE_NAME, name) + return func + + return wrapper + + def add_api_method(api_method_name): global _lock global _api_methods with _lock: # Push the method to the front of the _api_methods list - _api_methods.insert(0, api_method_name) + _api_methods.insert(0, api_method_name.replace("<", "").replace(">", "")) # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) _api_methods = _api_methods[:MAX_LABELS_COUNT] @@ -232,3 +255,17 @@ def get_and_reset_api_methods(dry_run: bool = False): if not dry_run: _api_methods.clear() return previous_api_methods + + +def _get_bq_client(*args, **kwargs): + # Assumes that on BigFrames API errors (TypeError/NotImplementedError), + # an input arg (likely the first, e.g., 'self') has `_block.session.bqclient` + for argv in args: + if hasattr(argv, "_block"): + return argv._block.session.bqclient + + for kwargv in kwargs.values(): + if hasattr(kwargv, "_block"): + return kwargv._block.session.bqclient + + return None diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 0fbfe7bd37..cc82c844f7 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -34,8 +34,9 @@ import google.cloud.bigquery as bq from bigframes.core import identifiers, local_data, sequences -from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field +from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET import bigframes.core.expression as ex +from bigframes.core.field import Field from bigframes.core.ordering import OrderingExpression, RowOrdering import bigframes.core.slices as slices import bigframes.core.window_spec as window @@ -578,6 +579,9 @@ class ScanItem(typing.NamedTuple): def with_id(self, id: identifiers.ColumnId) -> ScanItem: return ScanItem(id, self.dtype, self.source_id) + def with_source_id(self, source_id: str) -> ScanItem: + return ScanItem(self.id, self.dtype, source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -614,6 +618,21 @@ def project( result = ScanList((self.items[:1])) return result + def remap_source_ids( + self, + mapping: Mapping[str, str], + ) -> ScanList: + items = tuple( + 
item.with_source_id(mapping.get(item.source_id, item.source_id)) + for item in self.items + ) + return ScanList(items) + + def append( + self, source_id: str, dtype: bigframes.dtypes.Dtype, id: identifiers.ColumnId + ) -> ScanList: + return ScanList((*self.items, ScanItem(id, dtype, source_id))) + @dataclasses.dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): @@ -621,9 +640,9 @@ class ReadLocalNode(LeafNode): local_data_source: local_data.ManagedArrowTable # Mapping of local ids to bfet id. scan_list: ScanList + session: bigframes.session.Session # Offsets are generated only if this is non-null offsets_col: Optional[identifiers.ColumnId] = None - session: typing.Optional[bigframes.session.Session] = None @property def fields(self) -> Sequence[Field]: @@ -1172,26 +1191,25 @@ class ProjectionNode(UnaryNode, AdditiveNode): assignments: typing.Tuple[typing.Tuple[ex.Expression, identifiers.ColumnId], ...] def _validate(self): - input_types = self.child._dtype_lookup - for expression, id in self.assignments: + for expression, _ in self.assignments: # throws TypeError if invalid - _ = expression.output_type(input_types) + _ = ex.bind_schema_fields(expression, self.child.field_by_id).output_type # Cannot assign to existing variables - append only! assert all(name not in self.child.schema.names for _, name in self.assignments) @functools.cached_property def added_fields(self) -> Tuple[Field, ...]: - input_types = self.child._dtype_lookup - fields = [] for expr, id in self.assignments: + bound_expr = ex.bind_schema_fields(expr, self.child.field_by_id) field = Field( id, - bigframes.dtypes.dtype_for_etype(expr.output_type(input_types)), - nullable=expr.nullable, + bigframes.dtypes.dtype_for_etype(bound_expr.output_type), + nullable=bound_expr.nullable, ) + # Special case until we get better nullability inference in expression objects themselves - if expr.is_identity and not any( + if bound_expr.is_identity and not any( self.child.field_by_id[id].nullable for id in expr.column_references ): field = field.with_nonnull() @@ -1282,7 +1300,7 @@ def fields(self) -> Sequence[Field]: Field( id, bigframes.dtypes.dtype_for_etype( - agg.output_type(self.child._dtype_lookup) + agg.output_type(self.child.field_by_id) ), nullable=True, ) @@ -1392,11 +1410,11 @@ def row_count(self) -> Optional[int]: @functools.cached_property def added_field(self) -> Field: - input_types = self.child._dtype_lookup + input_fields = self.child.field_by_id # TODO: Determine if output could be non-null return Field( self.output_name, - bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_types)), + bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_fields)), ) @property diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 98f175d300..59ccdf1f5f 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -37,9 +37,13 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" -def _field_to_template_value(name: str, value: Any) -> str: +def _field_to_template_value( + name: str, + value: Any, +) -> str: """Convert value to something embeddable in a SQL string.""" import bigframes.core.sql # Avoid circular imports + import bigframes.dataframe # Avoid circular imports _validate_type(name, value) @@ -47,20 +51,27 @@ def _field_to_template_value(name: str, value: Any) -> str: if isinstance(value, table_types): return _table_to_sql(value) - # TODO(tswast): convert DataFrame objects to gbq tables or a 
literals subquery. + # TODO(tswast): convert pandas DataFrame objects to gbq tables or a literals subquery. + if isinstance(value, bigframes.dataframe.DataFrame): + return _table_to_sql(value._to_view()) + return bigframes.core.sql.simple_literal(value) def _validate_type(name: str, value: Any): """Raises TypeError if value is unsupported.""" import bigframes.core.sql # Avoid circular imports + import bigframes.dataframe # Avoid circular imports if value is None: return # None can't be used in isinstance, but is a valid literal. - supported_types = typing.get_args(_BQ_TABLE_TYPES) + typing.get_args( - bigframes.core.sql.SIMPLE_LITERAL_TYPES + supported_types = ( + typing.get_args(_BQ_TABLE_TYPES) + + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + + (bigframes.dataframe.DataFrame,) ) + if not isinstance(value, supported_types): raise TypeError( f"{name} has unsupported type: {type(value)}. " @@ -80,8 +91,6 @@ def pyformat( sql_template: str, *, pyformat_args: dict, - # TODO: add dry_run parameter to avoid expensive API calls in conversion - # TODO: and session to upload data / convert to table if necessary ) -> str: """Unsafe Python-style string formatting of SQL string. diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 069ebb9cdf..b8f1d26db8 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -22,7 +22,7 @@ try_reduce_to_local_scan, try_reduce_to_table_scan, ) -from bigframes.core.rewrite.slices import pull_up_limits, rewrite_slice +from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions from bigframes.core.rewrite.windows import rewrite_range_rolling @@ -32,6 +32,7 @@ "rewrite_slice", "rewrite_timedelta_expressions", "pull_up_limits", + "pull_out_limit", "remap_variables", "defer_order", "column_pruning", diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py new file mode 100644 index 0000000000..f3c313233b --- /dev/null +++ b/bigframes/core/rewrite/schema_binding.py @@ -0,0 +1,49 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses + +from bigframes.core import bigframe_node +from bigframes.core import expression as ex +from bigframes.core import nodes + + +def bind_schema_to_expressions( + node: bigframe_node.BigFrameNode, +) -> bigframe_node.BigFrameNode: + if isinstance(node, nodes.ProjectionNode): + bound_assignments = tuple( + (ex.bind_schema_fields(expr, node.child.field_by_id), id) + for expr, id in node.assignments + ) + return dataclasses.replace(node, assignments=bound_assignments) + + if isinstance(node, nodes.FilterNode): + bound_predicate = ex.bind_schema_fields(node.predicate, node.child.field_by_id) + return dataclasses.replace(node, predicate=bound_predicate) + + if isinstance(node, nodes.OrderByNode): + bound_bys = [] + for by in node.by: + bound_by = dataclasses.replace( + by, + scalar_expression=ex.bind_schema_fields( + by.scalar_expression, node.child.field_by_id + ), + ) + bound_bys.append(bound_by) + + return dataclasses.replace(node, by=tuple(bound_bys)) + + return node diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index b8a003e061..92911310da 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -26,7 +26,7 @@ def pull_up_limits(root: nodes.ResultNode) -> nodes.ResultNode: - new_child, pulled_limit = _pullup_slice_inner(root.child) + new_child, pulled_limit = pull_out_limit(root.child) if new_child == root.child: return root elif pulled_limit is None: @@ -37,7 +37,7 @@ def pull_up_limits(root: nodes.ResultNode) -> nodes.ResultNode: return dataclasses.replace(root, child=new_child, limit=new_limit) -def _pullup_slice_inner( +def pull_out_limit( root: nodes.BigFrameNode, ) -> Tuple[nodes.BigFrameNode, Optional[int]]: """ @@ -53,7 +53,7 @@ def _pullup_slice_inner( assert root.step == 1 assert root.stop is not None limit = root.stop - new_root, prior_limit = _pullup_slice_inner(root.child) + new_root, prior_limit = pull_out_limit(root.child) if (prior_limit is not None) and (prior_limit < limit): limit = prior_limit return new_root, limit @@ -61,7 +61,7 @@ def _pullup_slice_inner( isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode)) and root.row_preserving ): - new_child, prior_limit = _pullup_slice_inner(root.child) + new_child, prior_limit = pull_out_limit(root.child) if prior_limit is not None: return root.transform_children(lambda _: new_child), prior_limit # Most ops don't support pulling up slice, like filter, agg, join, etc. 
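# A minimal, illustrative sketch of the limit pull-up idea behind pull_out_limit above:
# descend through slice and row-preserving nodes, strip the slices, and report the
# tightest LIMIT found. The Node/Slice classes here are simplified stand-ins, not the
# real bigframes node types.
import dataclasses
from typing import Optional, Tuple


@dataclasses.dataclass(frozen=True)
class Node:
    child: Optional["Node"] = None
    row_preserving: bool = True


@dataclasses.dataclass(frozen=True)
class Slice(Node):
    stop: Optional[int] = None  # LIMIT n modeled as a slice with stop=n


def pull_out_limit_sketch(root: Node) -> Tuple[Node, Optional[int]]:
    """Return the tree without its leading slices plus the tightest limit found."""
    if isinstance(root, Slice) and root.stop is not None and root.child is not None:
        limit = root.stop
        new_root, prior_limit = pull_out_limit_sketch(root.child)
        if prior_limit is not None and prior_limit < limit:
            limit = prior_limit
        return new_root, limit
    if root.child is not None and root.row_preserving:
        new_child, prior_limit = pull_out_limit_sketch(root.child)
        if prior_limit is not None:
            return dataclasses.replace(root, child=new_child), prior_limit
    # Filters, aggregations, joins, etc. cannot have a limit pulled through them.
    return root, None


# LIMIT 10 applied on top of LIMIT 3 collapses to a single limit of 3.
tree = Slice(child=Slice(child=Node(), stop=3), stop=10)
_, limit = pull_out_limit_sketch(tree)
assert limit == 3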
diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index 82df53af82..baf4b12566 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -45,26 +45,13 @@ def can_fast_head(node: nodes.BigFrameNode) -> bool: # To do fast head operation: # (1) the underlying data must be arranged/indexed according to the logical ordering # (2) transformations must support pushing down LIMIT or a filter on row numbers - return has_fast_offset_address(node) or has_fast_offset_address(node) - - -def has_fast_orderby_limit(node: nodes.BigFrameNode) -> bool: - """True iff ORDER BY LIMIT can be performed without a large full table scan.""" - # TODO: In theory compatible with some Slice nodes, potentially by adding OFFSET - if isinstance(node, nodes.LeafNode): - return node.fast_ordered_limit - if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): - return has_fast_orderby_limit(node.child) - return False - - -def has_fast_offset_address(node: nodes.BigFrameNode) -> bool: - """True iff specific offsets can be scanned without a large full table scan.""" - # TODO: In theory can push offset lookups through slice operators by translating indices - if isinstance(node, nodes.LeafNode): - return node.fast_offsets + if isinstance(node, nodes.ReadLocalNode): + # always cheap to push slice into local data + return True + if isinstance(node, nodes.ReadTableNode): + return (node.source.ordering is None) or (node.fast_ordered_limit) if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): - return has_fast_offset_address(node.child) + return can_fast_head(node.child) return False diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8ed749138c..1d0d485392 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -27,6 +27,7 @@ from typing import ( Callable, Dict, + Hashable, Iterable, List, Literal, @@ -46,6 +47,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +from pandas.api import extensions as pd_ext import pandas.io.formats.format import pyarrow import tabulate @@ -394,6 +396,19 @@ def astype( return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) + def _should_sql_have_index(self) -> bool: + """Should the SQL we pass to BQML and other I/O include the index?""" + + return self._has_index and ( + self.index.name is not None or len(self.index.names) > 1 + ) + + def _to_view(self) -> bigquery.TableReference: + """Compiles this DataFrame's expression tree to SQL and saves it to a + (temporary) view. + """ + return self._block.to_view(include_index=self._should_sql_have_index()) + def _to_sql_query( self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[blocks.Label]]: @@ -420,9 +435,7 @@ def sql(self) -> str: string representing the compiled SQL. """ try: - include_index = self._has_index and ( - self.index.name is not None or len(self.index.names) > 1 - ) + include_index = self._should_sql_have_index() sql, _, _ = self._to_sql_query(include_index=include_index) return sql except AttributeError as e: @@ -2069,15 +2082,67 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: return self._block.index.resolve_level(level) + @overload def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame: + ... + + @overload + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: Literal[False] + ) -> DataFrame: + ... 
+ + @overload + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: Literal[True] + ) -> None: + ... + + def rename( + self, *, columns: Mapping[blocks.Label, blocks.Label], inplace: bool = False + ) -> Optional[DataFrame]: block = self._block.rename(columns=columns) - return DataFrame(block) + if inplace: + self._block = block + return None + else: + return DataFrame(block) + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + ) -> DataFrame: + ... + + @overload def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[False], **kwargs, ) -> DataFrame: + ... + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: bool = False, + **kwargs, + ) -> Optional[DataFrame]: if len(kwargs) != 0: raise NotImplementedError( f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" @@ -2087,7 +2152,14 @@ def rename_axis( labels = mapper else: labels = [mapper] - return DataFrame(self._block.with_index_labels(labels)) + + block = self._block.with_index_labels(labels) + + if inplace: + self._block = block + return None + else: + return DataFrame(block) @validations.requires_ordering() def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool: @@ -2913,9 +2985,23 @@ def nunique(self) -> bigframes.series.Series: return bigframes.series.Series(block) def agg( - self, func: str | typing.Sequence[str] + self, + func: str + | typing.Sequence[str] + | typing.Mapping[blocks.Label, typing.Sequence[str] | str], ) -> DataFrame | bigframes.series.Series: - if utils.is_list_like(func): + if utils.is_dict_like(func): + # Must check dict-like first because dictionaries are list-like + # according to Pandas. 
+ agg_cols = [] + for col_label, agg_func in func.items(): + agg_cols.append(self[col_label].agg(agg_func)) + + from bigframes.core.reshape import api as reshape + + return reshape.concat(agg_cols, axis=1) + + elif utils.is_list_like(func): aggregations = [agg_ops.lookup_agg_func(f) for f in func] for dtype, agg in itertools.product(self.dtypes, aggregations): @@ -2929,6 +3015,7 @@ def agg( aggregations, ) ) + else: return bigframes.series.Series( self._block.aggregate_all_and_stack( @@ -3597,6 +3684,47 @@ def _groupby_series( def abs(self) -> DataFrame: return self._apply_unary_op(ops.abs_op) + def round(self, decimals: Union[int, dict[Hashable, int]] = 0) -> DataFrame: + is_mapping = utils.is_dict_like(decimals) + if not (is_mapping or isinstance(decimals, int)): + raise TypeError("'decimals' must be either a dict-like or integer.") + block = self._block + exprs = [] + for label, col_id, dtype in zip( + block.column_labels, block.value_columns, block.dtypes + ): + if dtype in set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) - { + bigframes.dtypes.BOOL_DTYPE + }: + if is_mapping: + if label in decimals: # type: ignore + exprs.append( + ops.round_op.as_expr( + col_id, + ex.const( + decimals[label], dtype=bigframes.dtypes.INT_DTYPE # type: ignore + ), + ) + ) + else: + exprs.append(ex.deref(col_id)) + else: + exprs.append( + ops.round_op.as_expr( + col_id, + ex.const( + typing.cast(int, decimals), + dtype=bigframes.dtypes.INT_DTYPE, + ), + ) + ) + else: + exprs.append(ex.deref(col_id)) + + return DataFrame( + block.project_exprs(exprs, labels=block.column_labels, drop=True) + ) + def isna(self) -> DataFrame: return self._apply_unary_op(ops.isnull_op) @@ -4029,7 +4157,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index ec115a93d0..262fa9dde7 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -499,33 +499,6 @@ def bigframes_dtype_to_arrow_dtype( ) -def bigframes_dtype_to_literal( - bigframes_dtype: Dtype, -) -> Any: - """Create a representative literal value for a bigframes dtype. - - The inverse of infer_literal_type(). - """ - if isinstance(bigframes_dtype, pd.ArrowDtype): - arrow_type = bigframes_dtype.pyarrow_dtype - return arrow_type_to_literal(arrow_type) - - if isinstance(bigframes_dtype, pd.Float64Dtype): - return 1.0 - if isinstance(bigframes_dtype, pd.Int64Dtype): - return 1 - if isinstance(bigframes_dtype, pd.BooleanDtype): - return True - if isinstance(bigframes_dtype, pd.StringDtype): - return "string" - if isinstance(bigframes_dtype, gpd.array.GeometryDtype): - return shapely.geometry.Point((0, 0)) - - raise TypeError( - f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" - ) - - def arrow_type_to_literal( arrow_type: pa.DataType, ) -> Any: diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ece950a5a2..3ff32d2433 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -360,5 +360,12 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE. 
- return self._bqml_model.evaluate() + if X is not None and y is not None: + X, y = utils.batch_convert_to_dataframe( + X, y, session=self._bqml_model.session + ) + input_data = X.join(y, how="outer") + else: + input_data = X + + return self._bqml_model.evaluate(input_data) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ef74c4ac55..11861c786e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -112,11 +112,18 @@ "If you proceed with '{model_name}', it might not work as expected or could lead to errors with multimodal inputs." ) +_MODEL_DEPRECATE_WARNING = ( + "'{model_name}' is going to be deprecated. Use '{new_model_name}' ({link}) instead." +) + @log_adapter.class_logger class TextEmbeddingGenerator(base.RetriableRemotePredictor): """Text embedding generator LLM model. + .. note:: + text-embedding-004 is going to be deprecated. Use text-embedding-005(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. + Args: model_name (str, Default to "text-embedding-004"): The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004" @@ -169,6 +176,15 @@ def _create_bqml_model(self): ) ) warnings.warn(msg) + if self.model_name == "text-embedding-004": + msg = exceptions.format_message( + _MODEL_DEPRECATE_WARNING.format( + model_name=self.model_name, + new_model_name="text-embedding-005", + link="https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator", + ) + ) + warnings.warn(msg) options = { "endpoint": self.model_name, @@ -416,6 +432,7 @@ class GeminiTextGenerator(base.RetriableRemotePredictor): default and a warning will be issued. .. note:: + "gemini-1.5-X" is going to be deprecated. Please use gemini-2.0-X instead. For example, "gemini-2.0-flash-001". "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions @@ -461,10 +478,12 @@ def __init__( "(https://cloud.google.com/products#product-launch-stages)." ) warnings.warn(msg, category=exceptions.PreviewWarning) + if model_name is None: model_name = "gemini-2.0-flash-001" msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) warnings.warn(msg, category=FutureWarning, stacklevel=2) + self.model_name = model_name self.session = session or global_session.get_global_session() self.max_iterations = max_iterations @@ -487,6 +506,15 @@ def _create_bqml_model(self): ) ) warnings.warn(msg) + if self.model_name.startswith("gemini-1.5"): + msg = exceptions.format_message( + _MODEL_DEPRECATE_WARNING.format( + model_name=self.model_name, + new_model_name="gemini-2.0-X", + link="https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator", + ) + ) + warnings.warn(msg) options = {"endpoint": self.model_name} @@ -529,7 +557,8 @@ def fit( y: utils.ArrayType, ) -> GeminiTextGenerator: """Fine tune GeminiTextGenerator model. Only support "gemini-1.5-pro-002", - and "gemini-1.5-flash-002" models for now. + "gemini-1.5-flash-002", "gemini-2.0-flash-001", + and "gemini-2.0-flash-lite-001"models for now. .. 
note:: @@ -549,7 +578,7 @@ def fit( """ if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS: msg = exceptions.format_message( - "fit() only supports gemini-1.5-pro-002, or gemini-1.5-flash-002 model." + "fit() only supports gemini-1.5-pro-002, gemini-1.5-flash-002, gemini-2.0-flash-001, or gemini-2.0-flash-lite-001 model." ) warnings.warn(msg) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e5da674a8c..3e97ec6f4a 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -106,6 +106,7 @@ JSONExtract, JSONExtractArray, JSONExtractStringArray, + JSONQuery, JSONSet, JSONValue, ParseJSON, @@ -355,6 +356,7 @@ "JSONExtract", "JSONExtractArray", "JSONExtractStringArray", + "JSONQuery", "JSONSet", "JSONValue", "ParseJSON", diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 9d73fd43c1..c65947f53f 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import re import typing -from typing import List, Optional +from typing import Dict, List, Optional import warnings import numpy as np @@ -34,7 +36,13 @@ def __init__(self, df) -> None: self._df: bigframes.dataframe.DataFrame = df - def filter(self, instruction: str, model, ground_with_google_search: bool = False): + def filter( + self, + instruction: str, + model, + ground_with_google_search: bool = False, + attach_logprobs: bool = False, + ): """ Filters the DataFrame with the semantics of the user instruction. @@ -74,6 +82,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + Returns: bigframes.pandas.DataFrame: DataFrame filtered by the instruction. @@ -82,72 +94,27 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ - import bigframes.dataframe - import bigframes.series - self._validate_model(model) - columns = self._parse_columns(instruction) - for column in columns: - if column not in self._df.columns: - raise ValueError(f"Column {column} not found.") + answer_col = "answer" - if ground_with_google_search: - msg = exceptions.format_message( - "Enables Grounding with Google Search may impact billing cost. 
See pricing " - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" - ) - warnings.warn(msg, category=UserWarning) - - self._confirm_operation(len(self._df)) - - df: bigframes.dataframe.DataFrame = self._df[columns].copy() - has_blob_column = False - for column in columns: - if df[column].dtype == dtypes.OBJ_REF_DTYPE: - # Don't cast blob columns to string - has_blob_column = True - continue - - if df[column].dtype != dtypes.STRING_DTYPE: - df[column] = df[column].astype(dtypes.STRING_DTYPE) - - user_instruction = self._format_instruction(instruction, columns) - output_instruction = "Based on the provided context, reply to the following claim by only True or False:" - - if has_blob_column: - results = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - df, - prompt=self._make_multimodel_prompt( - df, columns, user_instruction, output_instruction - ), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) - else: - results = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - self._make_text_prompt( - df, columns, user_instruction, output_instruction - ), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) + output_schema = {answer_col: "bool"} + result = self.map( + instruction, + model, + output_schema, + ground_with_google_search, + attach_logprobs, + ) - return self._df[ - results["ml_generate_text_llm_result"].str.lower().str.contains("true") - ] + return result[result[answer_col]].drop(answer_col, axis=1) def map( self, instruction: str, - output_column: str, model, + output_schema: Dict[str, str] | None = None, ground_with_google_search: bool = False, + attach_logprobs=False, ): """ Maps the DataFrame with the semantics of the user instruction. @@ -163,7 +130,7 @@ def map( >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) - >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"}) ingredient_1 ingredient_2 food 0 Burger Bun Beef Patty Burger @@ -180,12 +147,14 @@ def map( in the instructions like: "Get the ingredients of {food}." - output_column (str): - The column name of the mapping result. - model (bigframes.ml.llm.GeminiTextGenerator): A GeminiTextGenerator provided by Bigframes ML package. + output_schema (Dict[str, str] or None, default None): + The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of :. + Supported types are int64, float64, bool, string, array and struct. If None, generate string result under the column + "ml_generate_text_llm_result". + ground_with_google_search (bool, default False): Enables Grounding with Google Search for the GeminiTextGenerator model. When set to True, the model incorporates relevant information from Google @@ -194,6 +163,11 @@ def map( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. 
The value is in the range between negative infinite and 0. + + Returns: bigframes.pandas.DataFrame: DataFrame with attached mapping results. @@ -236,6 +210,9 @@ def map( "Based on the provided contenxt, answer the following instruction:" ) + if output_schema is None: + output_schema = {"ml_generate_text_llm_result": "string"} + if has_blob_column: results = typing.cast( bigframes.series.Series, @@ -246,7 +223,8 @@ def map( ), temperature=0.0, ground_with_google_search=ground_with_google_search, - )["ml_generate_text_llm_result"], + output_schema=output_schema, + ), ) else: results = typing.cast( @@ -257,12 +235,28 @@ def map( ), temperature=0.0, ground_with_google_search=ground_with_google_search, - )["ml_generate_text_llm_result"], + output_schema=output_schema, + ), + ) + + attach_columns = [results[col] for col, _ in output_schema.items()] + + def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series: + from bigframes import bigquery as bbq + + logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0] + logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype( + "Float64" ) + logprobs.name = "logprob" + return logprobs + + if attach_logprobs: + attach_columns.append(extract_logprob(results["full_response"])) from bigframes.core.reshape.api import concat - return concat([self._df, results.rename(output_column)], axis=1) + return concat([self._df, *attach_columns], axis=1) def join( self, @@ -270,6 +264,7 @@ def join( instruction: str, model, ground_with_google_search: bool = False, + attach_logprobs=False, ): """ Joines two dataframes by applying the instruction over each pair of rows from @@ -313,10 +308,6 @@ def join( model (bigframes.ml.llm.GeminiTextGenerator): A GeminiTextGenerator provided by Bigframes ML package. - max_rows (int, default 1000): - The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method - call will end early with an error. - ground_with_google_search (bool, default False): Enables Grounding with Google Search for the GeminiTextGenerator model. When set to True, the model incorporates relevant information from Google @@ -325,6 +316,10 @@ def join( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + Returns: bigframes.pandas.DataFrame: The joined dataframe. 
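# A hedged usage sketch of the reworked ai accessor above: ai.map() now takes an
# output_schema mapping instead of output_column, ai.filter() is built on top of
# ai.map(), and both can attach a "logprob" confidence column. The model name and
# the map prompt come from the docstrings above; the filter prompt and column names
# are illustrative, and a configured BigQuery DataFrames session is assumed.
import bigframes.pandas as bpd
from bigframes.ml import llm

model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
df = bpd.DataFrame(
    {
        "ingredient_1": ["Burger Bun", "Soy Bean"],
        "ingredient_2": ["Beef Patty", "Bittern"],
    }
)

# Structured output: one extra column per output_schema entry, plus an optional
# "logprob" column when attach_logprobs=True.
mapped = df.ai.map(
    "What is the food made from {ingredient_1} and {ingredient_2}? One word only.",
    model=model,
    output_schema={"food": "string"},
    attach_logprobs=True,
)

# Filtering keeps only the rows for which the model answers True.
filtered = df.ai.filter("{ingredient_1} is a bread product", model=model)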
@@ -400,7 +395,10 @@ def join( joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right")) return joined_df.ai.filter( - instruction, model, ground_with_google_search=ground_with_google_search + instruction, + model, + ground_with_google_search=ground_with_google_search, + attach_logprobs=attach_logprobs, ).reset_index(drop=True) def search( diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 8d70596b7d..c316d28321 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -245,9 +245,9 @@ def _align( ) return (typing.cast(ex.DerefOp, values[0]), values[1], block) - def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left") -> tuple[ex.DerefOp, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore + def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left", cast_scalars: bool = True) -> tuple[ex.DerefOp, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore """Aligns the series value with 2 other scalars or series objects. Returns new values and joined tabled expression.""" - values, index = self._align_n([other1, other2], how) + values, index = self._align_n([other1, other2], how, cast_scalars=cast_scalars) return ( typing.cast(ex.DerefOp, values[0]), values[1], @@ -260,7 +260,7 @@ def _align_n( others: typing.Sequence[typing.Union[series.Series, scalars.Scalar]], how="outer", ignore_self=False, - cast_scalars: bool = True, + cast_scalars: bool = False, ) -> tuple[ typing.Sequence[Union[ex.ScalarConstantExpression, ex.DerefOp]], blocks.Block, diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index c9ce633cae..b083035d38 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -134,3 +134,18 @@ def output_type(self, *input_types): + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE + + +@dataclasses.dataclass(frozen=True) +class JSONQuery(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_query" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return input_type diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 9d6749a169..b9820cd0ea 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -289,7 +289,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) round_op = base_ops.create_binary_op( - name="round", type_signature=op_typing.BINARY_REAL_NUMERIC + name="round", type_signature=op_typing.BINARY_NUMERIC ) unsafe_pow_op = base_ops.create_binary_op( diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 8c5c54e8ca..9fa5450748 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -57,7 +57,7 @@ def agg( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame( ... { @@ -68,7 +68,7 @@ def agg( ... ], ... "Year": [1997, 2013, 2010], ... }) - >>> df.semantics.agg( + >>> df.semantics.agg( # doctest: +SKIP ... 
"Find the first name shared by all actors in {Movies}. One word answer.", ... model=model, ... ) @@ -252,7 +252,7 @@ def cluster_by( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator() + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") >>> df = bpd.DataFrame({ ... "Product": ["Smartphone", "Laptop", "T-shirt", "Jeans"], @@ -326,10 +326,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) - >>> df.semantics.filter("{city} is the capital of {country}", model) + >>> df.semantics.filter("{city} is the capital of {country}", model) # doctest: +SKIP country city 1 Germany Berlin @@ -440,10 +440,10 @@ def map( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) - >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) # doctest: +SKIP ingredient_1 ingredient_2 food 0 Burger Bun Beef Patty Burger @@ -563,12 +563,12 @@ def join( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) - >>> cities.semantics.join(continents, "{city} is in {continent}", model) + >>> cities.semantics.join(continents, "{city} is in {continent}", model) # doctest: +SKIP city continent 0 Seattle North America 1 Ottawa North America @@ -704,10 +704,10 @@ def search( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") # doctest: +SKIP >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) - >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') + >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') # doctest: +SKIP creatures distance 3 chimpanzee 0.635844 @@ -805,14 +805,14 @@ def top_k( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # doctest: +SKIP >>> df = bpd.DataFrame( ... { ... "Animals": ["Dog", "Bird", "Cat", "Horse"], ... 
"Sounds": ["Woof", "Chirp", "Meow", "Neigh"], ... }) - >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) + >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) # doctest: +SKIP Animals Sounds 0 Dog Woof 2 Cat Meow @@ -1006,12 +1006,12 @@ def sim_join( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") # doctest: +SKIP >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) - >>> df1.semantics.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + >>> df1.semantics.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) # doctest: +SKIP animal animal_1 0 monkey baboon 1 spider scorpion diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index a8430b0b0e..9022a1665e 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -15,7 +15,7 @@ from __future__ import annotations import re -from typing import cast, Literal, Optional, Union +from typing import Literal, Optional, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr @@ -230,21 +230,26 @@ def replace( flags: int = 0, regex: bool = False, ) -> series.Series: - is_compiled = isinstance(pat, re.Pattern) - patstr = cast(str, pat.pattern if is_compiled else pat) # type: ignore + if isinstance(pat, re.Pattern): + assert isinstance(pat.pattern, str) + pat_str = pat.pattern + flags = pat.flags | flags + else: + pat_str = pat + if case is False: - return self.replace(pat, repl, flags=flags | re.IGNORECASE, regex=True) + return self.replace(pat_str, repl, flags=flags | re.IGNORECASE, regex=True) if regex: re2flags = _parse_flags(flags) if re2flags: - patstr = re2flags + patstr - return self._apply_unary_op(ops.RegexReplaceStrOp(pat=patstr, repl=repl)) + pat_str = re2flags + pat_str + return self._apply_unary_op(ops.RegexReplaceStrOp(pat=pat_str, repl=repl)) else: - if is_compiled: + if isinstance(pat, re.Pattern): raise ValueError( "Must set 'regex'=True if using compiled regex pattern." 
) - return self._apply_unary_op(ops.ReplaceStrOp(pat=patstr, repl=repl)) + return self._apply_unary_op(ops.ReplaceStrOp(pat=pat_str, repl=repl)) def startswith( self, @@ -318,10 +323,15 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series: def _parse_flags(flags: int) -> Optional[str]: re2flags = [] for reflag, re2flag in REGEXP_FLAGS.items(): - if flags & flags: + if flags & reflag: re2flags.append(re2flag) flags = flags ^ reflag + # re2 handles unicode fine by default + # most compiled re in python will have unicode set + if re.U and flags: + flags = flags ^ re.U + # Remaining flags couldn't be mapped to re2 engine if flags: raise NotImplementedError( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index e90f123778..d08ef4e91d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -27,6 +27,7 @@ import pandas import bigframes._config as config +from bigframes.core import log_adapter import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes @@ -199,6 +200,7 @@ def get_default_session_id() -> str: return get_global_session().session_id +@log_adapter.method_logger def clean_up_by_session_id( session_id: str, location: Optional[str] = None, @@ -245,7 +247,6 @@ def clean_up_by_session_id( session.bqclient, location=location, project=project, - api_name="clean_up_by_session_id", ) bigframes.session._io.bigquery.delete_tables_matching_session_id( @@ -322,31 +323,33 @@ def reset_session(): except Exception: pass -# Use __all__ to let type checkers know what is part of the public API. -__all__ = [ - # Functions - "clean_up_by_session_id", - "concat", - "cut", - "get_default_session_id", - "get_dummies", - "merge", - "qcut", - "read_csv", - "read_gbq", - "read_gbq_function", - "read_gbq_model", - "read_gbq_object_table", - "read_gbq_query", - "read_gbq_table", - "read_json", - "read_pandas", - "read_parquet", - "read_pickle", - "remote_function", - "to_datetime", - "to_timedelta", - "from_glob_path", +_functions = [ + clean_up_by_session_id, + concat, + cut, + get_default_session_id, + get_dummies, + merge, + qcut, + read_csv, + read_gbq, + read_gbq_function, + read_gbq_model, + read_gbq_object_table, + read_gbq_query, + read_gbq_table, + read_json, + read_pandas, + read_parquet, + read_pickle, + remote_function, + to_datetime, + to_timedelta, + from_glob_path, +] + +_function_names = [_function.__name__ for _function in _functions] +_other_names = [ # pandas dtype attributes "NA", "BooleanDtype", @@ -371,3 +374,12 @@ def reset_session(): "reset_session", "udf", ] + +# Use __all__ to let type checkers know what is part of the public API. 
+__all__ = _function_names + _other_names + +_module = sys.modules[__name__] + +for _function in _functions: + _decorated_object = log_adapter.method_logger(_function, custom_base_name="pandas") + setattr(_module, _function.__name__, _decorated_object) diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index ecf8a59bb7..b2ce5f211e 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -15,6 +15,7 @@ from __future__ import annotations import inspect +import threading import typing from typing import ( Any, @@ -44,11 +45,8 @@ ) import bigframes._config as config -import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes -import bigframes.core.reshape -import bigframes.core.tools import bigframes.dataframe import bigframes.enums import bigframes.series @@ -468,6 +466,8 @@ def from_glob_path( from_glob_path.__doc__ = inspect.getdoc(bigframes.session.Session.from_glob_path) +_default_location_lock = threading.Lock() + def _set_default_session_location_if_possible(query): # Set the location as per the query if this is the first query the user is @@ -478,31 +478,35 @@ def _set_default_session_location_if_possible(query): # If query is a table name, then it would be the location of the table. # If query is a SQL with a table, then it would be table's location. # If query is a SQL with no table, then it would be the BQ default location. - if ( - config.options.bigquery._session_started - or config.options.bigquery.location - or config.options.bigquery.use_regional_endpoints - ): - return - - clients_provider = bigframes.session.clients.ClientsProvider( - project=config.options.bigquery.project, - location=config.options.bigquery.location, - use_regional_endpoints=config.options.bigquery.use_regional_endpoints, - credentials=config.options.bigquery.credentials, - application_name=config.options.bigquery.application_name, - bq_kms_key_name=config.options.bigquery.kms_key_name, - client_endpoints_override=config.options.bigquery.client_endpoints_override, - ) - - bqclient = clients_provider.bqclient - - if bigframes.session._io.bigquery.is_query(query): - # Intentionally run outside of the session so that we can detect the - # location before creating the session. Since it's a dry_run, labels - # aren't necessary. - job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) - config.options.bigquery.location = job.location - else: - table = bqclient.get_table(query) - config.options.bigquery.location = table.location + global _default_location_lock + + with _default_location_lock: + if ( + config.options.bigquery._session_started + or config.options.bigquery.location + or config.options.bigquery.use_regional_endpoints + ): + return + + clients_provider = bigframes.session.clients.ClientsProvider( + project=config.options.bigquery.project, + location=config.options.bigquery.location, + use_regional_endpoints=config.options.bigquery.use_regional_endpoints, + credentials=config.options.bigquery.credentials, + application_name=config.options.bigquery.application_name, + bq_kms_key_name=config.options.bigquery.kms_key_name, + client_endpoints_override=config.options.bigquery.client_endpoints_override, + requests_transport_adapters=config.options.bigquery.requests_transport_adapters, + ) + + bqclient = clients_provider.bqclient + + if bigframes.session._io.bigquery.is_query(query): + # Intentionally run outside of the session so that we can detect the + # location before creating the session. 
Since it's a dry_run, labels + # aren't necessary. + job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + config.options.bigquery.location = job.location + else: + table = bqclient.get_table(query) + config.options.bigquery.location = table.location diff --git a/bigframes/series.py b/bigframes/series.py index 1e29671310..74e8d03c8d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -31,6 +31,7 @@ Literal, Mapping, Optional, + overload, Sequence, Tuple, Union, @@ -42,6 +43,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +from pandas.api import extensions as pd_ext import pandas.core.dtypes.common import pyarrow as pa import typing_extensions @@ -94,6 +96,10 @@ class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Ser # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 13000 + # Ensure mypy can more robustly determine the type of self._block since it + # gets set in various places. + _block: blocks.Block + def __init__(self, *args, **kwargs): self._query_job: Optional[bigquery.QueryJob] = None super().__init__(*args, **kwargs) @@ -253,22 +259,45 @@ def __iter__(self) -> typing.Iterator: def copy(self) -> Series: return Series(self._block) + @overload def rename( - self, index: Union[blocks.Label, Mapping[Any, Any]] = None, **kwargs + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + ) -> Series: + ... + + @overload + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: Literal[False], + **kwargs, ) -> Series: + ... + + @overload + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + def rename( + self, + index: Union[blocks.Label, Mapping[Any, Any]] = None, + *, + inplace: bool = False, + **kwargs, + ) -> Optional[Series]: if len(kwargs) != 0: raise NotImplementedError( f"rename does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" ) - # rename the Series name - if index is None or isinstance( - index, str - ): # Python 3.9 doesn't allow isinstance of Optional - index = typing.cast(Optional[str], index) - block = self._block.with_column_labels([index]) - return Series(block) - # rename the index if isinstance(index, Mapping): index = typing.cast(Mapping[Any, Any], index) @@ -293,22 +322,61 @@ def rename( block = block.set_index(new_idx_ids, index_labels=block.index.names) - return Series(block) + if inplace: + self._block = block + return None + else: + return Series(block) # rename the Series name if isinstance(index, typing.Hashable): + # Python 3.9 doesn't allow isinstance of Optional index = typing.cast(Optional[str], index) block = self._block.with_column_labels([index]) - return Series(block) + + if inplace: + self._block = block + return None + else: + return Series(block) raise ValueError(f"Unsupported type of parameter index: {type(index)}") - @validations.requires_index + @overload def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + ) -> Series: + ... + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[False], **kwargs, ) -> Series: + ... + + @overload + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: Literal[True], + **kwargs, + ) -> None: + ... 
+ + @validations.requires_index + def rename_axis( + self, + mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + inplace: bool = False, + **kwargs, + ) -> Optional[Series]: if len(kwargs) != 0: raise NotImplementedError( f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" @@ -318,7 +386,13 @@ def rename_axis( labels = mapper else: labels = [mapper] - return Series(self._block.with_index_labels(labels)) + + block = self._block.with_index_labels(labels) + if inplace: + self._block = block + return None + else: + return Series(block) def equals( self, other: typing.Union[Series, bigframes.dataframe.DataFrame] @@ -1353,14 +1427,17 @@ def where(self, cond, other=None): ) return Series(block.select_column(result_id).with_column_labels([self.name])) - def clip(self, lower, upper): + def clip(self, lower=None, upper=None): if lower is None and upper is None: return self if lower is None: return self._apply_binary_op(upper, ops.minimum_op, alignment="left") if upper is None: return self._apply_binary_op(lower, ops.maximum_op, alignment="left") - value_id, lower_id, upper_id, block = self._align3(lower, upper) + # special rule to coerce scalar string args to date + value_id, lower_id, upper_id, block = self._align3( + lower, upper, cast_scalars=(bigframes.dtypes.is_date_like(self.dtype)) + ) block, result_id = block.project_expr( ops.clip_op.as_expr(value_id, lower_id, upper_id), ) @@ -1492,7 +1569,7 @@ def __getattr__(self, key: str): raise AttributeError(key) elif hasattr(pandas.Series, key): log_adapter.submit_pandas_labels( - self._block.expr.session.bqclient, self.__class__.__name__, key + self._block.session.bqclient, self.__class__.__name__, key ) raise AttributeError( textwrap.dedent( @@ -2106,7 +2183,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 17945f0be6..c24dca554a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -60,7 +60,8 @@ from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -from bigframes.core import blocks +import bigframes.constants +from bigframes.core import blocks, log_adapter, utils import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -104,6 +105,7 @@ logger = logging.getLogger(__name__) +@log_adapter.class_logger class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -170,6 +172,7 @@ def __init__( application_name=context.application_name, bq_kms_key_name=self._bq_kms_key_name, client_endpoints_override=context.client_endpoints_override, + requests_transport_adapters=context.requests_transport_adapters, ) # TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494 @@ -247,13 +250,6 @@ def __init__( self._temp_storage_manager = ( self._session_resource_manager or self._anon_dataset_manager ) - self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( - bqclient=self._clients_provider.bqclient, - bqstoragereadclient=self._clients_provider.bqstoragereadclient, - storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, - metrics=self._metrics, - ) self._loader = bigframes.session.loader.GbqDataLoader( session=self, bqclient=self._clients_provider.bqclient, @@ -264,6 +260,14 @@ def 
__init__( force_total_order=self._strictly_ordered, metrics=self._metrics, ) + self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( + bqclient=self._clients_provider.bqclient, + bqstoragereadclient=self._clients_provider.bqstoragereadclient, + loader=self._loader, + storage_manager=self._temp_storage_manager, + strictly_ordered=self._strictly_ordered, + metrics=self._metrics, + ) def __del__(self): """Automatic cleanup of internal resources.""" @@ -445,7 +449,6 @@ def read_gbq( columns=columns, configuration=configuration, max_results=max_results, - api_name="read_gbq", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -463,7 +466,6 @@ def read_gbq( index_col=index_col, columns=columns, max_results=max_results, - api_name="read_gbq", use_cache=use_cache if use_cache is not None else True, filters=filters, dry_run=dry_run, @@ -497,6 +499,7 @@ def _read_gbq_colab( ) -> pandas.Series: ... + @log_adapter.log_name_override("read_gbq_colab") def _read_gbq_colab( self, query: str, @@ -527,13 +530,11 @@ def _read_gbq_colab( query = bigframes.core.pyformat.pyformat( query, pyformat_args=pyformat_args, - # TODO: add dry_run parameter to avoid API calls for data in pyformat_args ) return self._loader.read_gbq_query( query=query, index_col=bigframes.enums.DefaultIndexKind.NULL, - api_name="read_gbq_colab", force_total_order=False, dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run), ) @@ -654,7 +655,6 @@ def read_gbq_query( columns=columns, configuration=configuration, max_results=max_results, - api_name="read_gbq_query", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -737,7 +737,6 @@ def read_gbq_table( index_col=index_col, columns=columns, max_results=max_results, - api_name="read_gbq_table", use_cache=use_cache, filters=filters, dry_run=dry_run, @@ -773,7 +772,6 @@ def read_gbq_table_streaming( df = self._loader.read_gbq_table( table, - api_name="read_gbq_table_steaming", enable_snapshot=False, index_col=bigframes.enums.DefaultIndexKind.NULL, ) @@ -906,7 +904,6 @@ def read_pandas( if isinstance(pandas_dataframe, pandas.Series): bf_df = self._read_pandas( pandas.DataFrame(pandas_dataframe), - "read_pandas", write_engine=write_engine, ) bf_series = series.Series(bf_df._block) @@ -916,13 +913,10 @@ def read_pandas( if isinstance(pandas_dataframe, pandas.Index): return self._read_pandas( pandas.DataFrame(index=pandas_dataframe), - "read_pandas", write_engine=write_engine, ).index if isinstance(pandas_dataframe, pandas.DataFrame): - return self._read_pandas( - pandas_dataframe, "read_pandas", write_engine=write_engine - ) + return self._read_pandas(pandas_dataframe, write_engine=write_engine) else: raise ValueError( f"read_pandas() expects a pandas.DataFrame, but got a {type(pandas_dataframe)}" @@ -931,7 +925,6 @@ def read_pandas( def _read_pandas( self, pandas_dataframe: pandas.DataFrame, - api_name: str, *, write_engine: constants.WriteEngineType = "default", ) -> dataframe.DataFrame: @@ -947,29 +940,27 @@ def _read_pandas( if write_engine == "default": write_engine = ( "bigquery_load" - if mem_usage > MAX_INLINE_DF_BYTES + if mem_usage > bigframes.constants.MAX_INLINE_BYTES else "bigquery_inline" ) if write_engine == "bigquery_inline": - if mem_usage > MAX_INLINE_DF_BYTES: + if mem_usage > bigframes.constants.MAX_INLINE_BYTES: raise ValueError( f"DataFrame size ({mem_usage} bytes) exceeds the maximum allowed " - f"for inline data ({MAX_INLINE_DF_BYTES} bytes)." + f"for inline data ({bigframes.constants.MAX_INLINE_BYTES} bytes)." 
) return self._read_pandas_inline(pandas_dataframe) elif write_engine == "bigquery_load": - return self._loader.read_pandas( - pandas_dataframe, method="load", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="load") elif write_engine == "bigquery_streaming": - return self._loader.read_pandas( - pandas_dataframe, method="stream", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="stream") elif write_engine == "bigquery_write": - return self._loader.read_pandas( - pandas_dataframe, method="write", api_name=api_name - ) + return self._loader.read_pandas(pandas_dataframe, method="write") + elif write_engine == "_deferred": + import bigframes.dataframe as dataframe + + return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe, self)) else: raise ValueError(f"Got unexpected write_engine '{write_engine}'") @@ -1097,7 +1088,7 @@ def _read_csv_w_pandas_engines( encoding=encoding, **kwargs, ) - return self._read_pandas(pandas_df, api_name="read_csv", write_engine=write_engine) # type: ignore + return self._read_pandas(pandas_df, write_engine=write_engine) # type: ignore def _read_csv_w_bigquery_engine( self, @@ -1118,11 +1109,8 @@ def _read_csv_w_bigquery_engine( native CSV loading capabilities, making it suitable for large datasets that may not fit into local memory. """ - if dtype is not None: - raise NotImplementedError( - f"BigQuery engine does not support the `dtype` argument." - f"{constants.FEEDBACK_LINK}" - ) + if dtype is not None and not utils.is_dict_like(dtype): + raise ValueError("dtype should be a dict-like object.") if names is not None: if len(names) != len(set(names)): @@ -1177,10 +1165,16 @@ def _read_csv_w_bigquery_engine( job_config.skip_leading_rows = header + 1 table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config) - return self._loader.read_gbq_table( + df = self._loader.read_gbq_table( table_id, index_col=index_col, columns=columns, names=names ) + if dtype is not None: + for column, dtype in dtype.items(): + if column in df.columns: + df[column] = df[column].astype(dtype) + return df + def read_pickle( self, filepath_or_buffer: FilePath | ReadPickleBuffer, @@ -1198,11 +1192,9 @@ def read_pickle( if isinstance(pandas_obj, pandas.Series): if pandas_obj.name is None: pandas_obj.name = 0 - bigframes_df = self._read_pandas(pandas_obj.to_frame(), "read_pickle") + bigframes_df = self._read_pandas(pandas_obj.to_frame()) return bigframes_df[bigframes_df.columns[0]] - return self._read_pandas( - pandas_obj, api_name="read_pickle", write_engine=write_engine - ) + return self._read_pandas(pandas_obj, write_engine=write_engine) def read_parquet( self, @@ -1248,9 +1240,7 @@ def read_parquet( engine=engine, # type: ignore **read_parquet_kwargs, ) - return self._read_pandas( - pandas_obj, api_name="read_parquet", write_engine=write_engine - ) + return self._read_pandas(pandas_obj, write_engine=write_engine) def read_json( self, @@ -1329,9 +1319,7 @@ def read_json( engine=engine, **kwargs, ) - return self._read_pandas( - pandas_df, api_name="read_json", write_engine=write_engine - ) + return self._read_pandas(pandas_df, write_engine=write_engine) def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes @@ -1959,6 +1947,10 @@ def _create_object_table(self, path: str, connection: str) -> str: return table + def _create_temp_view(self, sql: str) -> bigquery.TableReference: + """Create a random id Object Table from the input path and connection.""" + return 
self._anon_dataset_manager.create_temp_view(sql) + def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: @@ -1990,9 +1982,7 @@ def from_glob_path( table = self._create_object_table(path, connection) - s = self._loader.read_gbq_table(table, api_name="from_glob_path")[ - "uri" - ].str.to_blob(connection) + s = self._loader.read_gbq_table(table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() def _create_bq_connection( @@ -2045,9 +2035,7 @@ def read_gbq_object_table( table = self.bqclient.get_table(object_table) connection = table._properties["externalDataConfiguration"]["connectionId"] - s = self._loader.read_gbq_table(object_table, api_name="read_gbq_object_table")[ - "uri" - ].str.to_blob(connection) + s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 48268d925d..267111afe0 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -48,7 +48,6 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], api_methods: typing.List[str], - api_name: Optional[str] = None, ) -> Dict[str, str]: if job_configs_labels is None: job_configs_labels = {} @@ -58,9 +57,6 @@ def create_job_configs_labels( for key, value in bigframes.options.compute.extra_query_labels.items(): job_configs_labels[key] = value - if api_name is not None: - job_configs_labels["bigframes-api"] = api_name - if api_methods and "bigframes-api" not in job_configs_labels: job_configs_labels["bigframes-api"] = api_methods[0] del api_methods[0] @@ -143,6 +139,28 @@ def create_temp_table( return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" +def create_temp_view( + bqclient: bigquery.Client, + table_ref: bigquery.TableReference, + *, + expiration: datetime.datetime, + sql: str, +) -> str: + """Create an empty table with an expiration in the desired session. + + The table will be deleted when the session is closed or the expiration + is reached. + """ + destination = bigquery.Table(table_ref) + destination.expires = expiration + destination.view_query = sql + + # Ok if already exists, since this will only happen from retries internal to this method + # as the requested table id has a random UUID4 component. 
+ bqclient.create_table(destination, exists_ok=True) + return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" + + def set_table_expiration( bqclient: bigquery.Client, table_ref: bigquery.TableReference, @@ -202,7 +220,7 @@ def format_option(key: str, value: Union[bool, str]) -> str: return f"{key}={repr(value)}" -def add_and_trim_labels(job_config, api_name: Optional[str] = None): +def add_and_trim_labels(job_config): """ Add additional labels to the job configuration and trim the total number of labels to ensure they do not exceed the maximum limit allowed by BigQuery, which is 64 @@ -212,7 +230,6 @@ def add_and_trim_labels(job_config, api_name: Optional[str] = None): job_config.labels = create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods, - api_name=api_name, ) @@ -223,7 +240,6 @@ def start_query_with_client( location: Optional[str] = None, project: Optional[str] = None, timeout: Optional[float] = None, - api_name: Optional[str] = None, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, *, query_with_job: bool = True, @@ -234,7 +250,7 @@ def start_query_with_client( try: # Note: Ensure no additional labels are added to job_config after this point, # as `add_and_trim_labels` ensures the label count does not exceed 64. - add_and_trim_labels(job_config, api_name=api_name) + add_and_trim_labels(job_config) if not query_with_job: results_iterator = bq_client.query_and_wait( sql, @@ -308,7 +324,6 @@ def create_bq_dataset_reference( bq_client: bigquery.Client, location=None, project=None, - api_name: str = "unknown", ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. @@ -337,7 +352,6 @@ def create_bq_dataset_reference( location=location, job_config=job_config, project=project, - api_name=api_name, ) # The anonymous dataset is used by BigQuery to write query results and diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 6b1cb99c65..2dff16933f 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -165,7 +165,6 @@ def infer_unique_columns( bqclient: bigquery.Client, table: bigquery.table.Table, index_cols: List[str], - api_name: str, metadata_only: bool = False, ) -> Tuple[str, ...]: """Return a set of columns that can provide a unique row key or empty if none can be inferred. @@ -187,7 +186,6 @@ def infer_unique_columns( # table_expression only selects just index_cols. 
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index c5808aa63c..bc785f693f 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -43,7 +43,6 @@ def __init__( self.dataset = bf_io_bigquery.create_bq_dataset_reference( self.bqclient, location=self._location, - api_name="session-__init__", ) self.session_id = session_id @@ -54,6 +53,12 @@ def __init__( def location(self): return self._location + def _default_expiration(self): + """When should the table expire automatically?""" + return ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + def create_temp_table( self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] ) -> bigquery.TableReference: @@ -61,9 +66,7 @@ def create_temp_table( Allocates and and creates a table in the anonymous dataset. The table will be cleaned up by clean_up_tables. """ - expiration = ( - datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION - ) + expiration = self._default_expiration() table = bf_io_bigquery.create_temp_table( self.bqclient, self.allocate_temp_table(), @@ -74,6 +77,20 @@ def create_temp_table( ) return bigquery.TableReference.from_string(table) + def create_temp_view(self, sql: str) -> bigquery.TableReference: + """ + Allocates and and creates a view in the anonymous dataset. + The view will be cleaned up by clean_up_tables. + """ + expiration = self._default_expiration() + table = bf_io_bigquery.create_temp_view( + self.bqclient, + self.allocate_temp_table(), + expiration=expiration, + sql=sql, + ) + return bigquery.TableReference.from_string(table) + def allocate_temp_table(self) -> bigquery.TableReference: """ Allocates a unique table id, but does not create the table. 
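For reference, the temporary-view plumbing added above (`create_temp_view` in the BigQuery I/O helpers and in `AnonymousDatasetManager`) boils down to building a view object with an expiration and issuing an idempotent `create_table` call. A minimal standalone sketch of that call pattern follows, using placeholder project/dataset/table identifiers rather than the session's randomly allocated ids:

```python
import datetime

from google.cloud import bigquery

bqclient = bigquery.Client()

# Placeholder id; the session code allocates a random table id inside its
# anonymous dataset instead.
table_ref = bigquery.TableReference.from_string("my-project.my_dataset.my_temp_view")

view = bigquery.Table(table_ref)
view.view_query = "SELECT 1 AS x"
# Expire the view so session-scoped objects clean themselves up.
view.expires = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=1)

# exists_ok=True keeps internal retries of this call idempotent.
bqclient.create_table(view, exists_ok=True)
```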
diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index ae8dc88d43..883087df07 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -84,7 +84,9 @@ def create_temp_table( ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" - job = self.bqclient.query(ddl, job_config=job_config) + job = self.bqclient.query( + ddl, job_config=job_config, location=self.location + ) job.result() # return the fully qualified table, so it can be used outside of the session return job.destination @@ -94,7 +96,10 @@ def close(self): self._sessiondaemon.stop() if self._session_id is not None and self.bqclient is not None: - self.bqclient.query_and_wait(f"CALL BQ.ABORT_SESSION('{self._session_id}')") + self.bqclient.query_and_wait( + f"CALL BQ.ABORT_SESSION('{self._session_id}')", + location=self.location, + ) def _get_session_id(self) -> str: if self._session_id: diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 9288fdf641..33d3314a1e 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -17,6 +17,7 @@ import dataclasses import math import os +import threading from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings import weakref @@ -27,16 +28,19 @@ import google.cloud.bigquery.table as bq_table import google.cloud.bigquery_storage_v1 +import bigframes.constants import bigframes.core -from bigframes.core import compile, rewrite +from bigframes.core import compile, local_data, rewrite +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.guid import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.schema as schemata import bigframes.core.tree_properties as tree_properties import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.features -from bigframes.session import executor, local_scan_executor, read_api_execution +from bigframes.session import executor, loader, local_scan_executor, read_api_execution import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -65,12 +69,19 @@ def _get_default_output_spec() -> OutputSpec: ) +SourceIdMapping = Mapping[str, str] + + class ExecutionCache: def __init__(self): # current assumption is only 1 cache of a given node # in future, might have multiple caches, with different layout, localities self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.BigFrameNode + nodes.BigFrameNode, nodes.CachedTableNode + ] = weakref.WeakKeyDictionary() + self._uploaded_local_data: weakref.WeakKeyDictionary[ + local_data.ManagedArrowTable, + tuple[nodes.BigqueryDataSource, SourceIdMapping], ] = weakref.WeakKeyDictionary() @property @@ -103,6 +114,19 @@ def cache_results_table( assert original_root.schema == cached_replacement.schema self._cached_executions[original_root] = cached_replacement + def cache_remote_replacement( + self, + local_data: local_data.ManagedArrowTable, + bq_data: nodes.BigqueryDataSource, + ): + # bq table has one extra column for offsets, those are implicit for local data + assert len(local_data.schema.items) + 1 == len(bq_data.table.physical_schema) + mapping = { + local_data.schema.items[i].column: bq_data.table.physical_schema[i].name + for i in range(len(local_data.schema)) + } + 
self._uploaded_local_data[local_data] = (bq_data, mapping) + class BigQueryCachingExecutor(executor.Executor): """Computes BigFrames values using BigQuery Engine. @@ -118,6 +142,7 @@ def __init__( bqclient: bigquery.Client, storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager, bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, + loader: loader.GbqDataLoader, *, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, @@ -127,6 +152,7 @@ def __init__( self.strictly_ordered: bool = strictly_ordered self.cache: ExecutionCache = ExecutionCache() self.metrics = metrics + self.loader = loader self.bqstoragereadclient = bqstoragereadclient # Simple left-to-right precedence for now self._semi_executors = ( @@ -136,6 +162,7 @@ def __init__( ), local_scan_executor.LocalScanExecutor(), ) + self._upload_lock = threading.Lock() def to_sql( self, @@ -147,6 +174,7 @@ def to_sql( if offset_column: array_value, _ = array_value.promote_offsets() node = self.logical_plan(array_value.node) if enable_cache else array_value.node + node = self._substitute_large_local_sources(node) compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered)) return compiled.sql @@ -206,17 +234,45 @@ def export_gbq( if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) - dispositions = { - "fail": bigquery.WriteDisposition.WRITE_EMPTY, - "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, - "append": bigquery.WriteDisposition.WRITE_APPEND, - } + table_exists = True + try: + table = self.bqclient.get_table(destination) + if if_exists == "fail": + raise ValueError(f"Table already exists: {destination.__str__()}") + except google.api_core.exceptions.NotFound: + table_exists = False + + if len(cluster_cols) != 0: + if table_exists and table.clustering_fields != cluster_cols: + raise ValueError( + "Table clustering fields cannot be changed after the table has " + f"been created. Existing clustering fields: {table.clustering_fields}" + ) + sql = self.to_sql(array_value, ordered=False) - job_config = bigquery.QueryJobConfig( - write_disposition=dispositions[if_exists], - destination=destination, - clustering_fields=cluster_cols if cluster_cols else None, - ) + if table_exists and _if_schema_match(table.schema, array_value.schema): + # b/409086472: Uses DML for table appends and replacements to avoid + # BigQuery `RATE_LIMIT_EXCEEDED` errors, as per quota limits: + # https://cloud.google.com/bigquery/quotas#standard_tables + job_config = bigquery.QueryJobConfig() + ir = sqlglot_ir.SQLGlotIR.from_query_string(sql) + if if_exists == "append": + sql = ir.insert(destination) + else: # for "replace" + assert if_exists == "replace" + sql = ir.replace(destination) + else: + dispositions = { + "fail": bigquery.WriteDisposition.WRITE_EMPTY, + "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, + "append": bigquery.WriteDisposition.WRITE_APPEND, + } + job_config = bigquery.QueryJobConfig( + write_disposition=dispositions[if_exists], + destination=destination, + clustering_fields=cluster_cols if cluster_cols else None, + ) + # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. 
_, query_job = self._run_execute_query( @@ -263,7 +319,6 @@ def export_gcs( self.bqclient, export_data_statement, job_config=bigquery.QueryJobConfig(), - api_name=f"dataframe-to_{format.lower()}", metrics=self.metrics, ) return query_job @@ -313,7 +368,6 @@ def _run_execute_query( self, sql: str, job_config: Optional[bq_job.QueryJobConfig] = None, - api_name: Optional[str] = None, query_with_job: bool = True, ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]: """ @@ -333,7 +387,6 @@ def _run_execute_query( self.bqclient, sql, job_config=job_config, - api_name=api_name, metrics=self.metrics, query_with_job=query_with_job, ) @@ -375,6 +428,7 @@ def _cache_with_cluster_cols( ): """Executes the query and uses the resulting table to rewrite future executions.""" plan = self.logical_plan(array_value.node) + plan = self._substitute_large_local_sources(plan) compiled = compile.compile_sql( compile.CompileRequest( plan, sort_rows=False, materialize_all_order_keys=True @@ -395,7 +449,7 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): w_offsets, offset_column = array_value.promote_offsets() compiled = compile.compile_sql( compile.CompileRequest( - self.logical_plan(w_offsets.node), + self.logical_plan(self._substitute_large_local_sources(w_offsets.node)), sort_rows=False, ) ) @@ -477,7 +531,6 @@ def _sql_as_cached_temp_table( _, query_job = self._run_execute_query( sql, job_config=job_config, - api_name="cached", ) assert query_job is not None query_job.result() @@ -506,6 +559,54 @@ def _validate_result_schema( f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" ) + def _substitute_large_local_sources(self, original_root: nodes.BigFrameNode): + """ + Replace large local sources with the uploaded version of those datasources. + """ + # Step 1: Upload all previously un-uploaded data + for leaf in original_root.unique_nodes(): + if isinstance(leaf, nodes.ReadLocalNode): + if ( + leaf.local_data_source.metadata.total_bytes + > bigframes.constants.MAX_INLINE_BYTES + ): + self._upload_local_data(leaf.local_data_source) + + # Step 2: Replace local scans with remote scans + def map_local_scans(node: nodes.BigFrameNode): + if not isinstance(node, nodes.ReadLocalNode): + return node + if node.local_data_source not in self.cache._uploaded_local_data: + return node + bq_source, source_mapping = self.cache._uploaded_local_data[ + node.local_data_source + ] + scan_list = node.scan_list.remap_source_ids(source_mapping) + # offsets_col isn't part of ReadTableNode, so emulate by adding to end of scan_list + if node.offsets_col is not None: + # Offsets are always implicitly the final column of uploaded data + # See: Loader.load_data + scan_list = scan_list.append( + bq_source.table.physical_schema[-1].name, + bigframes.dtypes.INT_DTYPE, + node.offsets_col, + ) + return nodes.ReadTableNode(bq_source, scan_list, node.session) + + return original_root.bottom_up(map_local_scans) + + def _upload_local_data(self, local_table: local_data.ManagedArrowTable): + if local_table in self.cache._uploaded_local_data: + return + # Lock prevents concurrent repeated work, but slows things down. 
+ # Might be better as a queue and a worker thread + with self._upload_lock: + if local_table not in self.cache._uploaded_local_data: + uploaded = self.loader.load_data( + local_table, bigframes.core.guid.generate_guid() + ) + self.cache.cache_remote_replacement(local_table, uploaded) + def _execute_plan( self, plan: nodes.BigFrameNode, @@ -536,6 +637,8 @@ def _execute_plan( # Use explicit destination to avoid 10GB limit of temporary table if destination_table is not None: job_config.destination = destination_table + + plan = self._substitute_large_local_sources(plan) compiled = compile.compile_sql( compile.CompileRequest(plan, sort_rows=ordered, peek_count=peek) ) @@ -576,6 +679,21 @@ def _execute_plan( ) +def _if_schema_match( + table_schema: Tuple[bigquery.SchemaField, ...], schema: schemata.ArraySchema +) -> bool: + if len(table_schema) != len(schema.items): + return False + for field in table_schema: + if field.name not in schema.names: + return False + if bigframes.dtypes.convert_schema_field(field)[1] != schema.get_type( + field.name + ): + return False + return True + + def _sanitize( schema: Tuple[bigquery.SchemaField, ...] ) -> Tuple[bigquery.SchemaField, ...]: diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 5ef974d565..d680b94b8a 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -15,20 +15,22 @@ """Clients manages the connection to Google APIs.""" import os +import threading import typing -from typing import Optional +from typing import Optional, Sequence, Tuple import google.api_core.client_info import google.api_core.client_options -import google.api_core.exceptions import google.api_core.gapic_v1.client_info import google.auth.credentials +import google.auth.transport.requests import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 import pydata_google_auth +import requests import bigframes.constants import bigframes.version @@ -79,11 +81,18 @@ def __init__( application_name: Optional[str] = None, bq_kms_key_name: Optional[str] = None, client_endpoints_override: dict = {}, + *, + requests_transport_adapters: Sequence[ + Tuple[str, requests.adapters.BaseAdapter] + ] = (), ): credentials_project = None if credentials is None: credentials, credentials_project = _get_default_credentials_with_project() + # Ensure an access token is available. + credentials.refresh(google.auth.transport.requests.Request()) + # Prefer the project in this order: # 1. Project explicitly specified by the user # 2. 
Project set in the environment @@ -121,25 +130,37 @@ def __init__( ) self._location = location self._use_regional_endpoints = use_regional_endpoints + self._requests_transport_adapters = requests_transport_adapters self._credentials = credentials self._bq_kms_key_name = bq_kms_key_name self._client_endpoints_override = client_endpoints_override # cloud clients initialized for lazy load + self._bqclient_lock = threading.Lock() self._bqclient = None + + self._bqconnectionclient_lock = threading.Lock() self._bqconnectionclient: Optional[ google.cloud.bigquery_connection_v1.ConnectionServiceClient ] = None + + self._bqstoragereadclient_lock = threading.Lock() self._bqstoragereadclient: Optional[ google.cloud.bigquery_storage_v1.BigQueryReadClient ] = None + + self._bqstoragewriteclient_lock = threading.Lock() self._bqstoragewriteclient: Optional[ google.cloud.bigquery_storage_v1.BigQueryWriteClient ] = None + + self._cloudfunctionsclient_lock = threading.Lock() self._cloudfunctionsclient: Optional[ google.cloud.functions_v2.FunctionServiceClient ] = None + + self._resourcemanagerclient_lock = threading.Lock() self._resourcemanagerclient: Optional[ google.cloud.resourcemanager_v3.ProjectsClient ] = None @@ -159,13 +180,30 @@ def _create_bigquery_client(self): user_agent=self._application_name ) + requests_session = google.auth.transport.requests.AuthorizedSession( + self._credentials + ) + for prefix, adapter in self._requests_transport_adapters: + requests_session.mount(prefix, adapter) + bq_client = bigquery.Client( client_info=bq_info, client_options=bq_options, - credentials=self._credentials, project=self._project, location=self._location, + # Instead of credentials, use _http so that users can override + # requests options with transport adapters. See internal issue + # b/419106112. + _http=requests_session, ) + + # If a new enough client library is available, we opt-in to the faster + # backend behavior. This only affects code paths where query_and_wait is + # used, which doesn't expose a query job directly. See internal issue + # b/417985981. + if hasattr(bq_client, "default_job_creation_mode"): + bq_client.default_job_creation_mode = "JOB_CREATION_OPTIONAL" + if self._bq_kms_key_name: # Note: Key configuration only applies automatically to load and query jobs, not copy jobs. 
encryption_config = bigquery.EncryptionConfiguration( @@ -186,114 +224,126 @@ def _create_bigquery_client(self): @property def bqclient(self): - if not self._bqclient: - self._bqclient = self._create_bigquery_client() + with self._bqclient_lock: + if not self._bqclient: + self._bqclient = self._create_bigquery_client() return self._bqclient @property def bqconnectionclient(self): - if not self._bqconnectionclient: - bqconnection_options = None - if "bqconnectionclient" in self._client_endpoints_override: - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqconnectionclient"] - ) + with self._bqconnectionclient_lock: + if not self._bqconnectionclient: + bqconnection_options = None + if "bqconnectionclient" in self._client_endpoints_override: + bqconnection_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqconnectionclient" + ] + ) - bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqconnectionclient = ( - google.cloud.bigquery_connection_v1.ConnectionServiceClient( - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=self._credentials, + bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqconnectionclient = ( + google.cloud.bigquery_connection_v1.ConnectionServiceClient( + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, + ) ) - ) return self._bqconnectionclient @property def bqstoragereadclient(self): - if not self._bqstoragereadclient: - bqstorage_options = None - if "bqstoragereadclient" in self._client_endpoints_override: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqstoragereadclient"] - ) - elif self._use_regional_endpoints: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( - location=self._location + with self._bqstoragereadclient_lock: + if not self._bqstoragereadclient: + bqstorage_options = None + if "bqstoragereadclient" in self._client_endpoints_override: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqstoragereadclient" + ] + ) + elif self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) - ) - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqstoragereadclient = ( - google.cloud.bigquery_storage_v1.BigQueryReadClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=self._credentials, + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqstoragereadclient = ( + google.cloud.bigquery_storage_v1.BigQueryReadClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) ) - ) return self._bqstoragereadclient @property def bqstoragewriteclient(self): - if not self._bqstoragewriteclient: - bqstorage_options = None - if "bqstoragewriteclient" in self._client_endpoints_override: - bqstorage_options = google.api_core.client_options.ClientOptions( - 
api_endpoint=self._client_endpoints_override["bqstoragewriteclient"] - ) - elif self._use_regional_endpoints: - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( - location=self._location + with self._bqstoragewriteclient_lock: + if not self._bqstoragewriteclient: + bqstorage_options = None + if "bqstoragewriteclient" in self._client_endpoints_override: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override[ + "bqstoragewriteclient" + ] + ) + elif self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) - ) - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqstoragewriteclient = ( - google.cloud.bigquery_storage_v1.BigQueryWriteClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=self._credentials, + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._bqstoragewriteclient = ( + google.cloud.bigquery_storage_v1.BigQueryWriteClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) ) - ) return self._bqstoragewriteclient @property def cloudfunctionsclient(self): - if not self._cloudfunctionsclient: - functions_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._cloudfunctionsclient = ( - google.cloud.functions_v2.FunctionServiceClient( - client_info=functions_info, - credentials=self._credentials, + with self._cloudfunctionsclient_lock: + if not self._cloudfunctionsclient: + functions_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._cloudfunctionsclient = ( + google.cloud.functions_v2.FunctionServiceClient( + client_info=functions_info, + credentials=self._credentials, + ) ) - ) return self._cloudfunctionsclient @property def resourcemanagerclient(self): - if not self._resourcemanagerclient: - resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=self._application_name - ) - self._resourcemanagerclient = ( - google.cloud.resourcemanager_v3.ProjectsClient( - credentials=self._credentials, client_info=resourcemanager_info + with self._resourcemanagerclient_lock: + if not self._resourcemanagerclient: + resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=self._application_name + ) + self._resourcemanagerclient = ( + google.cloud.resourcemanager_v3.ProjectsClient( + credentials=self._credentials, client_info=resourcemanager_info + ) ) - ) return self._resourcemanagerclient diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index caf3dfc2bb..51e8e72c9a 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -37,6 +37,10 @@ def get_table_stats(table: bigquery.Table) -> pandas.Series: index.append("columnDtypes") values.append(col_dtypes) + # Add raw BQ schema + index.append("bigquerySchema") + values.append(table.schema) + for key in ("numBytes", "numRows", "location", "type"): index.append(key) values.append(table._properties[key]) @@ -96,8 +100,12 @@ def get_query_stats( ) -> pandas.Series: """Returns important stats from the query job as a Pandas Series.""" - index = [] - values = [] + index: List[Any] = [] 
+ values: List[Any] = [] + + # Add raw BQ schema + index.append("bigquerySchema") + values.append(query_job.schema) job_api_repr = copy.deepcopy(query_job._properties) @@ -110,6 +118,8 @@ def get_query_stats( configuration = job_api_repr.get("configuration", {}) index.append("jobType") values.append(configuration.get("jobType", None)) + index.append("dispatchedSql") + values.append(configuration.get("query", {}).get("query", None)) query_config = configuration.get("query", {}) for key in ("destinationTable", "useLegacySql"): diff --git a/bigframes/session/environment.py b/bigframes/session/environment.py index 3ed6ab98cd..940f8deed4 100644 --- a/bigframes/session/environment.py +++ b/bigframes/session/environment.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. + import importlib import json import os +import pathlib + +Path = pathlib.Path + # The identifier for GCP VS Code extension # https://cloud.google.com/code/docs/vscode/install @@ -29,40 +34,36 @@ def _is_vscode_extension_installed(extension_id: str) -> bool: """ Checks if a given Visual Studio Code extension is installed. - Args: extension_id: The ID of the extension (e.g., "ms-python.python"). - Returns: True if the extension is installed, False otherwise. """ try: # Determine the user's VS Code extensions directory. - user_home = os.path.expanduser("~") - if os.name == "nt": # Windows - vscode_extensions_dir = os.path.join(user_home, ".vscode", "extensions") - elif os.name == "posix": # macOS and Linux - vscode_extensions_dir = os.path.join(user_home, ".vscode", "extensions") - else: - raise OSError("Unsupported operating system.") + user_home = Path.home() + vscode_extensions_dir = user_home / ".vscode" / "extensions" # Check if the extensions directory exists. - if os.path.exists(vscode_extensions_dir): - # Iterate through the subdirectories in the extensions directory. - for item in os.listdir(vscode_extensions_dir): - item_path = os.path.join(vscode_extensions_dir, item) - if os.path.isdir(item_path) and item.startswith(extension_id + "-"): - # Check if the folder starts with the extension ID. - # Further check for manifest file, as a more robust check. - manifest_path = os.path.join(item_path, "package.json") - if os.path.exists(manifest_path): - try: - with open(manifest_path, "r", encoding="utf-8") as f: - json.load(f) - return True - except (FileNotFoundError, json.JSONDecodeError): - # Corrupted or incomplete extension, or manifest missing. - pass + if not vscode_extensions_dir.exists(): + return False + + # Iterate through the subdirectories in the extensions directory. + extension_dirs = filter( + lambda p: p.is_dir() and p.name.startswith(extension_id + "-"), + vscode_extensions_dir.iterdir(), + ) + for extension_dir in extension_dirs: + # As a more robust check, the manifest file must exist. 
+ manifest_path = extension_dir / "package.json" + if not manifest_path.exists() or not manifest_path.is_file(): + continue + + # Finally, the manifest file must be a valid json + with open(manifest_path, "r", encoding="utf-8") as f: + json.load(f) + + return True except Exception: pass @@ -72,10 +73,8 @@ def _is_vscode_extension_installed(extension_id: str) -> bool: def _is_package_installed(package_name: str) -> bool: """ Checks if a Python package is installed. - Args: package_name: The name of the package to check (e.g., "requests", "numpy"). - Returns: True if the package is installed, False otherwise. """ diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index b630dedb7b..ba669a62bb 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -44,7 +44,7 @@ import pandas import pyarrow as pa -from bigframes.core import guid, local_data, utils +from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.schema as schemata @@ -172,7 +172,6 @@ def read_pandas( self, pandas_dataframe: pandas.DataFrame, method: Literal["load", "stream", "write"], - api_name: str, ) -> dataframe.DataFrame: # TODO: Push this into from_pandas, along with index flag from bigframes import dataframe @@ -184,36 +183,55 @@ def read_pandas( [*idx_cols, *val_cols], axis="columns" ) managed_data = local_data.ManagedArrowTable.from_pandas(prepared_df) + block = blocks.Block( + self.read_managed_data(managed_data, method=method), + index_columns=idx_cols, + column_labels=pandas_dataframe.columns, + index_labels=pandas_dataframe.index.names, + ) + return dataframe.DataFrame(block) + def read_managed_data( + self, + data: local_data.ManagedArrowTable, + method: Literal["load", "stream", "write"], + ) -> core.ArrayValue: + offsets_col = guid.generate_guid("upload_offsets_") if method == "load": - array_value = self.load_data(managed_data, api_name=api_name) + gbq_source = self.load_data(data, offsets_col=offsets_col) elif method == "stream": - array_value = self.stream_data(managed_data) + gbq_source = self.stream_data(data, offsets_col=offsets_col) elif method == "write": - array_value = self.write_data(managed_data) + gbq_source = self.write_data(data, offsets_col=offsets_col) else: raise ValueError(f"Unsupported read method {method}") - block = blocks.Block( - array_value, - index_columns=idx_cols, - column_labels=pandas_dataframe.columns, - index_labels=pandas_dataframe.index.names, + return core.ArrayValue.from_bq_data_source( + source=gbq_source, + scan_list=nodes.ScanList( + tuple( + nodes.ScanItem( + identifiers.ColumnId(item.column), item.dtype, item.column + ) + for item in data.schema.items + ) + ), + session=self._session, ) - return dataframe.DataFrame(block) def load_data( - self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None - ) -> core.ArrayValue: + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("load_offsets_") # JSON support incomplete for item in data.schema.items: _validate_dtype_can_load(item.column, item.dtype) schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_LOAD_JOB_TYPE_OVERRIDES) @@ -227,17 +245,15 @@ def load_data( job_config.parquet_options = 
parquet_options job_config.schema = bq_schema - if api_name: - job_config.labels = {"bigframes-api": api_name} load_table_destination = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] + bq_schema, [offsets_col] ) buffer = io.BytesIO() data.to_parquet( buffer, - offsets_col=ordering_col, + offsets_col=offsets_col, geo_format="wkt", duration_type="duration", json_type="string", @@ -249,23 +265,24 @@ def load_data( self._start_generic_job(load_job) # must get table metadata after load job for accurate metadata destination_table = self._bqclient.get_table(load_table_destination) - return core.ArrayValue.from_table( - table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) - def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + def stream_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("stream_offsets_") schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) load_table_destination = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] + bq_schema, [offsets_col] ) rows = data.itertuples( @@ -284,24 +301,23 @@ def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: f"Problem loading at least one row from DataFrame: {errors}. 
{constants.FEEDBACK_LINK}" ) destination_table = self._bqclient.get_table(load_table_destination) - return core.ArrayValue.from_table( - table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) - def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + def write_data( + self, + data: local_data.ManagedArrowTable, + offsets_col: str, + ) -> nodes.BigqueryDataSource: """Load managed data into bigquery""" - ordering_col = guid.generate_guid("stream_offsets_") schema_w_offsets = data.schema.append( - schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) + schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE) ) bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) - bq_table_ref = self._storage_manager.create_temp_table( - bq_schema, [ordering_col] - ) + bq_table_ref = self._storage_manager.create_temp_table(bq_schema, [offsets_col]) requested_stream = bq_storage_types.stream.WriteStream() requested_stream.type_ = bq_storage_types.stream.WriteStream.Type.COMMITTED # type: ignore @@ -313,7 +329,7 @@ def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]: schema, batches = data.to_arrow( - offsets_col=ordering_col, duration_type="int" + offsets_col=offsets_col, duration_type="int" ) offset = 0 for batch in batches: @@ -339,13 +355,11 @@ def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]: assert response.row_count == data.data.num_rows destination_table = self._bqclient.get_table(bq_table_ref) - return core.ArrayValue.from_table( - table=destination_table, - schema=schema_w_offsets, - session=self._session, - offsets_col=ordering_col, - n_rows=data.data.num_rows, - ).drop_columns([ordering_col]) + return nodes.BigqueryDataSource( + nodes.GbqTable.from_table(destination_table), + ordering=ordering.TotalOrdering.from_offset_col(offsets_col), + n_rows=data.metadata.row_count, + ) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: @@ -368,12 +382,12 @@ def read_gbq_table( # type: ignore[overload-overlap] columns: Iterable[str] = ..., names: Optional[Iterable[str]] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: bool = ..., filters: third_party_pandas_gbq.FiltersType = ..., enable_snapshot: bool = ..., dry_run: Literal[False] = ..., force_total_order: Optional[bool] = ..., + n_rows: Optional[int] = None, ) -> dataframe.DataFrame: ... @@ -390,12 +404,12 @@ def read_gbq_table( columns: Iterable[str] = ..., names: Optional[Iterable[str]] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: bool = ..., filters: third_party_pandas_gbq.FiltersType = ..., enable_snapshot: bool = ..., dry_run: Literal[True] = ..., force_total_order: Optional[bool] = ..., + n_rows: Optional[int] = None, ) -> pandas.Series: ... 
@@ -411,12 +425,12 @@ def read_gbq_table( columns: Iterable[str] = (), names: Optional[Iterable[str]] = None, max_results: Optional[int] = None, - api_name: str = "read_gbq_table", use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), enable_snapshot: bool = True, dry_run: bool = False, force_total_order: Optional[bool] = None, + n_rows: Optional[int] = None, ) -> dataframe.DataFrame | pandas.Series: import bigframes._tools.strings import bigframes.dataframe as dataframe @@ -543,7 +557,6 @@ def read_gbq_table( query, index_col=index_cols, columns=columns, - api_name=api_name, use_cache=use_cache, dry_run=dry_run, ) @@ -595,7 +608,6 @@ def read_gbq_table( bqclient=self._bqclient, table=table, index_cols=index_cols, - api_name=api_name, # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique metadata_only=not self._scan_index_uniqueness, ) @@ -609,6 +621,7 @@ def read_gbq_table( at_time=time_travel_timestamp if enable_snapshot else None, primary_key=primary_key, session=self._session, + n_rows=n_rows, ) # if we don't have a unique index, we order by row hash if we are in strict mode if ( @@ -654,9 +667,10 @@ def read_gbq_table( renamed_cols: Dict[str, str] = { col: new_name for col, new_name in zip(array_value.column_ids, names) } - index_names = [ - renamed_cols.get(index_col, index_col) for index_col in index_cols - ] + if index_col != bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: + index_names = [ + renamed_cols.get(index_col, index_col) for index_col in index_cols + ] value_columns = [renamed_cols.get(col, col) for col in value_columns] block = blocks.Block( @@ -718,7 +732,6 @@ def read_gbq_query( # type: ignore[overload-overlap] columns: Iterable[str] = ..., configuration: Optional[Dict] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: Optional[bool] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., @@ -735,7 +748,6 @@ def read_gbq_query( columns: Iterable[str] = ..., configuration: Optional[Dict] = ..., max_results: Optional[int] = ..., - api_name: str = ..., use_cache: Optional[bool] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., @@ -751,7 +763,6 @@ def read_gbq_query( columns: Iterable[str] = (), configuration: Optional[Dict] = None, max_results: Optional[int] = None, - api_name: str = "read_gbq_query", use_cache: Optional[bool] = None, filters: third_party_pandas_gbq.FiltersType = (), dry_run: bool = False, @@ -817,7 +828,6 @@ def read_gbq_query( destination, query_job = self._query_to_destination( query, cluster_candidates=[], - api_name=api_name, configuration=configuration, ) @@ -845,8 +855,8 @@ def read_gbq_query( index_col=index_col, columns=columns, use_cache=configuration["query"]["useQueryCache"], - api_name=api_name, force_total_order=force_total_order, + n_rows=query_job.result().total_rows, # max_results and filters are omitted because they are already # handled by to_query(), above. ) @@ -855,7 +865,6 @@ def _query_to_destination( self, query: str, cluster_candidates: List[str], - api_name: str, configuration: dict = {"query": {"useQueryCache": True}}, do_clustering=True, ) -> Tuple[Optional[bigquery.TableReference], bigquery.QueryJob]: @@ -863,11 +872,9 @@ def _query_to_destination( # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. 
dry_run_config = bigquery.QueryJobConfig() dry_run_config.dry_run = True - _, dry_run_job = self._start_query( - query, job_config=dry_run_config, api_name=api_name - ) + _, dry_run_job = self._start_query(query, job_config=dry_run_config) if dry_run_job.statement_type != "SELECT": - _, query_job = self._start_query(query, api_name=api_name) + _, query_job = self._start_query(query) return query_job.destination, query_job # Create a table to workaround BigQuery 10 GB query results limit. See: @@ -905,7 +912,6 @@ def _query_to_destination( query, job_config=job_config, timeout=timeout, - api_name=api_name, ) return query_job.destination, query_job except google.api_core.exceptions.BadRequest: @@ -913,7 +919,7 @@ def _query_to_destination( # tables as the destination. For example, if the query has a # top-level ORDER BY, this conflicts with our ability to cluster # the table by the index column(s). - _, query_job = self._start_query(query, timeout=timeout, api_name=api_name) + _, query_job = self._start_query(query, timeout=timeout) return query_job.destination, query_job def _start_query( @@ -921,7 +927,6 @@ def _start_query( sql: str, job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, timeout: Optional[float] = None, - api_name: Optional[str] = None, ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: """ Starts BigQuery query job and waits for results. @@ -939,7 +944,6 @@ def _start_query( sql, job_config=job_config, timeout=timeout, - api_name=api_name, ) assert query_job is not None return iterator, query_job diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 055abd5899..6a8038e189 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -40,16 +40,15 @@ def count_job_stats( ): if query_job is None: assert row_iterator is not None - if (row_iterator.total_bytes_processed is None) or ( - row_iterator.query is None - ): + total_bytes_processed = getattr(row_iterator, "total_bytes_processed", None) + query = getattr(row_iterator, "query", None) + if total_bytes_processed is None or query is None: return - query_char_count = len(row_iterator.query) - bytes_processed = row_iterator.total_bytes_processed + self.execution_count += 1 - self.query_char_count += query_char_count - self.bytes_processed += bytes_processed - write_stats_to_disk(query_char_count, bytes_processed) + self.query_char_count += len(query) + self.bytes_processed += total_bytes_processed + write_stats_to_disk(len(query), total_bytes_processed) return stats = get_performance_stats(query_job) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 46d55eb303..d4bbf2783c 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -18,7 +18,7 @@ from google.cloud import bigquery_storage_v1 import pyarrow as pa -from bigframes.core import bigframe_node, rewrite +from bigframes.core import bigframe_node, nodes, pyarrow_utils, rewrite from bigframes.session import executor, semi_executor @@ -39,14 +39,16 @@ def execute( ordered: bool, peek: Optional[int] = None, ) -> Optional[executor.ExecuteResult]: - node = rewrite.try_reduce_to_table_scan(plan) - if not node: + adapt_result = self._try_adapt_plan(plan, ordered) + if not adapt_result: return None + node, limit = adapt_result if node.explicitly_ordered and ordered: return None - if peek: - # TODO: Support peeking - return None + + if limit is not None: + if peek is None or limit < peek: + peek = limit import 
google.cloud.bigquery_storage_v1.types as bq_storage_types from google.protobuf import timestamp_pb2 @@ -92,16 +94,48 @@ def execute( def process_page(page): pa_batch = page.to_arrow() + pa_batch = pa_batch.select( + [item.source_id for item in node.scan_list.items] + ) return pa.RecordBatch.from_arrays( pa_batch.columns, names=[id.sql for id in node.ids] ) batches = map(process_page, rowstream.pages) + if peek: + batches = pyarrow_utils.truncate_pyarrow_iterable(batches, max_results=peek) + + rows = node.source.n_rows + if peek and rows: + rows = min(peek, rows) + return executor.ExecuteResult( arrow_batches=batches, schema=plan.schema, query_job=None, total_bytes=None, - total_rows=node.source.n_rows, + total_rows=rows, ) + + def _try_adapt_plan( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + ) -> Optional[tuple[nodes.ReadTableNode, Optional[int]]]: + """ + Tries to simplify the plan to an equivalent single ReadTableNode and a limit. Otherwise, returns None. + """ + plan, limit = rewrite.pull_out_limit(plan) + # bake_order does not allow slice ops + plan = plan.bottom_up(rewrite.rewrite_slice) + if not ordered: + # gets rid of order_by ops + plan = rewrite.bake_order(plan) + read_table_node = rewrite.try_reduce_to_table_scan(plan) + if read_table_node is None: + return None + if (limit is not None) and (read_table_node.source.ordering is not None): + # read api can only use physical ordering to limit, not a logical ordering + return None + return (read_table_node, limit) diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 528835f6da..ca6fa57d0b 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -14,7 +14,7 @@ import copy import datetime -from typing import Optional, Sequence +from typing import Any, Dict, Optional, Sequence import unittest.mock as mock import google.auth.credentials @@ -23,12 +23,9 @@ import bigframes import bigframes.clients -import bigframes.core.ordering +import bigframes.core.global_session import bigframes.dataframe -import bigframes.series import bigframes.session.clients -import bigframes.session.executor -import bigframes.session.metrics """Utilities for creating test resources.""" @@ -129,7 +126,10 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): def create_dataframe( - monkeypatch: pytest.MonkeyPatch, *, session: Optional[bigframes.Session] = None + monkeypatch: pytest.MonkeyPatch, + *, + session: Optional[bigframes.Session] = None, + data: Optional[Dict[str, Sequence[Any]]] = None, ) -> bigframes.dataframe.DataFrame: """[Experimental] Create a mock DataFrame that avoids making Google Cloud API calls. @@ -138,8 +138,11 @@ def create_dataframe( if session is None: session = create_bigquery_session() + if data is None: + data = {"col": []} + # Since this may create a ReadLocalNode, the session we explicitly pass in # might not actually be used. Mock out the global session, too. 
monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({"col": []}, session=session) + return bigframes.dataframe.DataFrame(data, session=session) diff --git a/tests/unit/polars_session.py b/bigframes/testing/polars_session.py similarity index 96% rename from tests/unit/polars_session.py rename to bigframes/testing/polars_session.py index d592b49038..f8dda8da55 100644 --- a/tests/unit/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -16,6 +16,7 @@ from typing import Optional, Union import weakref +import pandas import polars import bigframes @@ -87,5 +88,7 @@ def __init__(self): def read_pandas(self, pandas_dataframe, write_engine="default"): # override read_pandas to always keep data local-only + if isinstance(pandas_dataframe, pandas.Series): + pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) return bigframes.dataframe.DataFrame(local_block) diff --git a/bigframes/version.py b/bigframes/version.py index 3a34d3d7bb..6cc3d952ed 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.4.0" +__version__ = "2.5.0" # {x-release-please-start-date} -__release_date__ = "2025-05-12" +__release_date__ = "2025-05-30" # {x-release-please-end} diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index f70e81d196..a27f162a9a 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -115,6 +115,8 @@ uid: bigframes.ml.decomposition - name: PCA uid: bigframes.ml.decomposition.PCA + - name: MatrixFactorization + uid: bigframes.ml.decomposition.MatrixFactorization name: decomposition - items: - name: Overview diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index f830e35c16..b59777a5da 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -111,7 +111,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb index 9edb174f18..8c7790b1ea 100644 --- a/notebooks/dataframes/integrations.ipynb +++ b/notebooks/dataframes/integrations.ipynb @@ -66,9 +66,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/swast/src/bigframes-2/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return func(get_global_session(), *args, **kwargs)\n" + "/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" ] + }, + { + "data": { + "text/html": [ + "Query job 1772ca28-2ef5-425c-87fe-8227aeb9318c is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -96,13 +108,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job eb7f3bbe-dda9-4d2f-b195-21de862d7055 is DONE. 0 Bytes processed. Open Job" + "Query job 33bd5814-b594-4ec4-baba-8f6b6e285e48 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -128,13 +140,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 4ad50c3c-91d0-4fef-91f6-0a2c5a30c38f is DONE. 0 Bytes processed. Open Job" + "Query job 1594d97a-1203-4c28-8730-caffb3ac4e9e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -146,10 +158,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_9a045ff143db4f8ab2018994287020f3'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_7578d5bd9949422599ccb9e4fe6451be'" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -172,13 +184,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 9e7d4b1a-d7fc-4599-bab4-40062c83288e is DONE. 0 Bytes processed. Open Job" + "Query job 8afc1538-9779-487a-a063-def5f438ee11 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -192,11 +204,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 3 4 -0.1250 d\n", - "1 1 2 -0.5000 b\n", + "0 1 2 -0.5000 b\n", + "1 2 3 0.2500 c\n", "2 0 1 1.0000 a\n", - "3 4 5 0.0625 e\n", - "4 2 3 0.2500 c\n" + "3 3 4 -0.1250 d\n", + "4 4 5 0.0625 e\n" ] } ], @@ -238,13 +250,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 62db313e-7632-4dbb-8eff-5035d0e6c27e is DONE. 0 Bytes processed. Open Job" + "Query job b6f68a49-5129-448d-bca3-62a23dced10d is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -258,11 +270,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 1 2 -0.5000 b\n", - "1 3 4 -0.1250 d\n", - "2 0 1 1.0000 a\n", - "3 4 5 0.0625 e\n", - "4 2 3 0.2500 c\n" + "0 3 4 -0.1250 d\n", + "1 1 2 -0.5000 b\n", + "2 4 5 0.0625 e\n", + "3 2 3 0.2500 c\n", + "4 0 1 1.0000 a\n" ] } ], @@ -274,7 +286,7 @@ " table_id = df.to_gbq()\n", "\n", " bqclient = df.bqclient\n", - " token = bqclient._credentials.token\n", + " token = bqclient._http.credentials.token\n", " project_id = bqclient.project\n", "\n", " share_table_and_start_workload(table_id, token, project_id)\n", @@ -335,13 +347,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 1cbd8898-97c7-419e-87af-b72a9432afb6 is DONE. 0 Bytes processed. Open Job" + "Query job 0f205180-cf26-46e5-950d-109947b7f5a1 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -353,10 +365,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_58b9b6fc0c3349bf8d3dd6fb29ab5322'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_240520e0723548f18fd3bd5d24cbbf82'" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -378,13 +390,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 40e54aa9-fad7-47c3-9bec-144f6c7106d8 is DONE. 0 Bytes processed. Open Job" + "Query job 80177f9a-4f6e-4a4e-97db-f119ea686c62 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -396,10 +408,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_cdb4f54063b0417a8309c462b70239fa'" + "'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_4ca41d2f28f84feca1bbafe9304fd89f'" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -427,16 +439,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" + "Dataset(DatasetReference('bigframes-dev', 'my_dataset'))" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -451,33 +463,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 73cf9e04-d5fa-4765-827c-665f0e6b9e00 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b177eb37-197f-4732-8978-c74cccb36e01 is DONE. 270 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -593,7 +581,7 @@ "[10 rows x 3 columns]" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -683,7 +671,7 @@ ], "metadata": { "kernelspec": { - "display_name": "bigframes", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -697,7 +685,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.10" } }, "nbformat": 4, diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 9f35d3864a..49a9d798e2 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -139,7 +139,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", "the future.\n", " warnings.warn(msg, category=bfe.PreviewWarning)\n" ] @@ -160,13 +160,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "vCkraKOeqJFl" }, "outputs": [], "source": [ - "bpd.options.bigquery.project = 'YOUR_PROJECT_ID'\n", + "bpd.options.bigquery.project = 'bigframes-dev'\n", "bpd.options.bigquery.location = 'US'" ] }, @@ -227,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "F4dZm4b7iouR" }, @@ -248,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "BoUK-cpbiouS" }, @@ -403,7 +403,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -575,12 +575,108 @@ "id": "VFObP2aFiouS" }, "source": [ - "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results." + "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the output column name." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are using BigFrames version `2.5.0` or later, the column name is specified with the `output_schema` parameter. This parameter expects a dictionary input in the form of `{'col_name': 'type_name'}`." 
] }, { "cell_type": "code", "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ingredient_1ingredient_2food
0BunBeef PattyHamburger
1Soy BeanBitternTofu
2SausageLong BreadHotdog
\n", + "

3 rows × 3 columns

\n", + "
[3 rows x 3 columns in total]" ], "text/plain": [ " ingredient_1 ingredient_2 food\n", "0 Bun Beef Patty Hamburger\n", "1 Soy Bean Bittern Tofu\n", "2 Sausage Long Bread Hotdog\n", "\n", "[3 rows x 3 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", model=gemini_model, output_schema={\"food\": \"string\"})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you are using BigFrames version 2.4.0 or earlier, the column name is specified with the `output_column` parameter. The outputs are always strings." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, @@ -667,7 +763,7 @@ } ], "source": [ - "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" ] }, { @@ -3170,7 +3266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.17" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 788111cfe6..edb864613c 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -430,7 +430,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 31a47ea424..9b05e1ab02 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -1614,7 +1614,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "q_a_model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "q_a_model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb new file mode 100644 index 0000000000..04ea0571df --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -0,0 +1,770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames LLM Output Schema\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This Notebook introduces BigFrames LLM with output schema to generate structured output dataframes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd\n", + "from bigframes.ml import llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Create a BigFrames DataFrame and a Gemini model\n", + "Starting from creating a simple dataframe of several cities and a Gemini model in BigFrames" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city
0Seattle
1New York
2Shanghai
\n", + "

3 rows × 1 columns

\n", + "
[3 rows x 1 columns in total]" + ], + "text/plain": [ + " city\n", + "0 Seattle\n", + "1 New York\n", + "2 Shanghai\n", + "\n", + "[3 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"city\": [\"Seattle\", \"New York\", \"Shanghai\"]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n" + ] + } + ], + "source": [ + "gemini = llm.GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Generate structured output data\n", + "Previously, LLM models could only generate text output. Say, for example, you want to know whether each city is a US city:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityml_generate_text_llm_result
0SeattleYes, Seattle is a city in the United States. I...
1New YorkYes, New York City is a city in the United Sta...
2ShanghaiNo, Shanghai is not a US city. It is a major c...
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city ml_generate_text_llm_result\n", + "0 Seattle Yes, Seattle is a city in the United States. I...\n", + "1 New York Yes, New York City is a city in the United Sta...\n", + "2 Shanghai No, Shanghai is not a US city. It is a major c...\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"])\n", + "result[[\"city\", \"ml_generate_text_llm_result\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The outputs are text results that humans can read. But to make the output data more useful for analysis, it is better to convert it to structured data such as boolean, int, or float values. Previously, that conversion wasn't easy.\n", + "\n", + "Now you can get structured output out of the box by specifying the output_schema parameter in the Gemini model's predict method. In the example below, the outputs are boolean values only." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_us_city
0SeattleTrue
1New YorkTrue
2ShanghaiFalse
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city is_us_city\n", + "0 Seattle True\n", + "1 New York True\n", + "2 Shanghai False\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"], output_schema={\"is_us_city\": \"bool\"})\n", + "result[[\"city\", \"is_us_city\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get float or int values, for example, to get populations in millions:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
citypopulation_in_millions
0Seattle0.75
1New York19.68
2Shanghai26.32
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city population_in_millions\n", + "0 Seattle 0.75\n", + "1 New York 19.68\n", + "2 Shanghai 26.32\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"what is the population in millions of\", df[\"city\"]], output_schema={\"population_in_millions\": \"float64\"})\n", + "result[[\"city\", \"population_in_millions\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And yearly rainy days:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityrainy_days
0Seattle152
1New York123
2Shanghai123
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city rainy_days\n", + "0 Seattle 152\n", + "1 New York 123\n", + "2 Shanghai 123\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"how many rainy days per year in\", df[\"city\"]], output_schema={\"rainy_days\": \"int64\"})\n", + "result[[\"city\", \"rainy_days\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Generate all types of data in one prediction\n", + "You can get the different output columns and types in one prediction. \n", + "\n", + "Note it doesn't require dedicated prompts, as long as the output column names are informative to the model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypopulation_in_millionsrainy_days_per_year
0SeattleTrue0.75152
1New YorkTrue8.8121
2ShanghaiFalse26.32115
\n", + "

3 rows × 4 columns

\n", + "
[3 rows x 4 columns in total]" + ], + "text/plain": [ + " city is_US_city population_in_millions rainy_days_per_year\n", + "0 Seattle True 0.75 152\n", + "1 New York True 8.8 121\n", + "2 Shanghai False 26.32 115\n", + "\n", + "[3 rows x 4 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"population_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\"})\n", + "result[[\"city\", \"is_US_city\", \"population_in_millions\", \"rainy_days_per_year\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Generate composite data types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Composite data types like array and struct can also be generated. Here the example generates a places_to_visit column as an array of strings and a gps_coordinates column as a struct of floats, along with the previous fields, all in one prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypopulation_in_millionsrainy_days_per_yearplaces_to_visitgps_coordinates
0SeattleTrue0.74150['Space Needle' 'Pike Place Market' 'Museum of...{'latitude': 47.6062, 'longitude': -122.3321}
1New YorkTrue8.4121['Times Square' 'Central Park' 'Statue of Libe...{'latitude': 40.7128, 'longitude': -74.006}
2ShanghaiFalse26.32115['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori...{'latitude': 31.2304, 'longitude': 121.4737}
\n", + "

3 rows × 6 columns

\n", + "
[3 rows x 6 columns in total]" + ], + "text/plain": [ + " city is_US_city population_in_millions rainy_days_per_year \\\n", + "0 Seattle True 0.74 150 \n", + "1 New York True 8.4 121 \n", + "2 Shanghai False 26.32 115 \n", + "\n", + " places_to_visit \\\n", + "0 ['Space Needle' 'Pike Place Market' 'Museum of... \n", + "1 ['Times Square' 'Central Park' 'Statue of Libe... \n", + "2 ['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori... \n", + "\n", + " gps_coordinates \n", + "0 {'latitude': 47.6062, 'longitude': -122.3321} \n", + "1 {'latitude': 40.7128, 'longitude': -74.006} \n", + "2 {'latitude': 31.2304, 'longitude': 121.4737} \n", + "\n", + "[3 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"population_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\", \"places_to_visit\": \"array\", \"gps_coordinates\": \"struct\"})\n", + "result[[\"city\", \"is_US_city\", \"population_in_millions\", \"rainy_days_per_year\", \"places_to_visit\", \"gps_coordinates\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index a15209aae4..15929fd666 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -1816,7 +1816,7 @@ "source": [ "## gemini model\n", "\n", - "llm_model = bf_llm.GeminiTextGenerator(model_name = \"gemini-1.5-flash-002\") ## replace with other model as needed" + "llm_model = bf_llm.GeminiTextGenerator(model_name = \"gemini-2.0-flash-001\") ## replace with other model as needed" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index c65a0f0854..413e473c2f 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -581,7 +581,7 @@ ], "source": [ "# Define the model\n", - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", "\n", "# Invoke LLM with prompt\n", "response = predict(zero_shot_prompt, temperature = TEMPERATURE)\n", diff --git a/notebooks/generative_ai/large_language_models.ipynb b/notebooks/generative_ai/large_language_models.ipynb index 4a0d2f2b3c..1d7bc7f6ef 100644 --- a/notebooks/generative_ai/large_language_models.ipynb +++ b/notebooks/generative_ai/large_language_models.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -21,23 +21,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_604997/3896046356.py:1: ApiDeprecationWarning: gemini-pro and gemini-1.5-X are going to be deprecated. 
Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. \n", - " model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/llm.py:981: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + "/tmp/ipykernel_176683/987800245.py:1: ApiDeprecationWarning: gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. \n", + " model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/llm.py:486: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", " self.session = session or global_session.get_global_session()\n" ] }, { "data": { "text/html": [ - "Query job dd2da3cc-27c3-4c6f-9936-4f7769c85090 is DONE. 0 Bytes processed. Open Job" + "Query job 6fa5121a-6da4-4c75-92ec-936799da4513 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -49,7 +49,7 @@ { "data": { "text/html": [ - "Query job 00947011-4d7c-42fa-ae19-3b684976cec6 is DONE. 0 Bytes processed. Open Job" + "Query job 74460ae9-3e89-49e7-93ad-bafbb6197a86 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -60,7 +60,7 @@ } ], "source": [ - "model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")" ] }, { diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 12847483ac..68c5e9f74d 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -1419,7 +1419,7 @@ "source": [ "# from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "# model = GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")\n", + "# model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", "\n", "# pred = model.predict(df)\n", "# pred" diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index ce3f10b881..3f36c2908a 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -55,7 +55,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook is introducing BigFrames experimental Multimodal features:\n", + "This notebook is introducing BigFrames Multimodal features:\n", "1. Create Multimodal DataFrame\n", "2. Combine unstructured data with structured data\n", "3. 
Conduct image transformations\n", @@ -72,6 +72,22 @@ "### Setup" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the latest bigframes package if bigframes version < 2.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install bigframes --upgrade" + ] + }, { "cell_type": "code", "execution_count": null, @@ -84,10 +100,13 @@ }, "outputs": [], "source": [ - "PROJECT = \"bigframes-dev\" # replace with your project, project needs to be allowlisted go/bq-multimodal-allowlist (internal)\n", - "# User must have https://cloud.google.com/bigquery/docs/use-bigquery-dataframes#permissions to use bigframes, BQ connection admin/user to create/use connections, BQ ObjRef permissions for ObjectRef and BQ routines permissions for using transform functions.\n", - "# Or simply has BQ Admin role for all.\n", - "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket.\n", + "PROJECT = \"bigframes-dev\" # replace with your project. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", + "\n", + "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", + "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", + "# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.\n", "\n", "import bigframes\n", "# Setup project\n", @@ -414,13 +433,6 @@ "chunked = df_pdf[\"chunked\"].explode()\n", "chunked" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/noxfile.py b/noxfile.py index 5c20487ccd..297e8f9d6f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -70,9 +70,10 @@ "mock", "asyncmock", PYTEST_VERSION, - "pytest-cov", "pytest-asyncio", + "pytest-cov", "pytest-mock", + "pytest-timeout", ] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] @@ -118,10 +119,10 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. nox.options.sessions = [ - "unit", "system-3.9", "system-3.12", "cover", + # TODO(b/401609005): remove "cleanup", ] @@ -228,6 +229,10 @@ def run_unit(session, install_test_extra): session.run( "py.test", "--quiet", + # Any individual test taking longer than 1 mins will be terminated. + "--timeout=60", + # Log 20 slowest tests + "--durations=20", f"--junitxml=unit_{session.python}_sponge_log.xml", "--cov=bigframes", f"--cov={tests_path}", @@ -355,7 +360,7 @@ def run_system( # Run py.test against the system tests. pytest_cmd = [ "py.test", - "--quiet", + "-v", f"-n={num_workers}", # Any individual test taking longer than 15 mins will be terminated. 
f"--timeout={timeout_seconds}", @@ -424,6 +429,8 @@ def doctest(session: nox.sessions.Session): "third_party/bigframes_vendored/ibis", "--ignore", "bigframes/core/compile/polars", + "--ignore", + "bigframes/testing", ), test_folder="bigframes", check_cov=True, @@ -464,20 +471,31 @@ def cover(session): session.install("coverage", "pytest-cov") # Create a coverage report that includes only the product code. + omitted_paths = [ + # non-prod, unit tested + "bigframes/core/compile/polars/*", + "bigframes/core/compile/sqlglot/*", + # untested + "bigframes/streaming/*", + # utils + "bigframes/testing/*", + ] + session.run( "coverage", "report", "--include=bigframes/*", + # Only unit tested + f"--omit={','.join(omitted_paths)}", "--show-missing", - "--fail-under=85", + "--fail-under=84", ) - # Make sure there is no dead code in our test directories. + # Make sure there is no dead code in our system test directories. session.run( "coverage", "report", "--show-missing", - "--include=tests/unit/*", "--include=tests/system/small/*", # TODO(b/353775058) resume coverage to 100 when the issue is fixed. "--fail-under=99", diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py index 1a15790815..0cc2b1d8b5 100644 --- a/samples/snippets/bigquery_modules_test.py +++ b/samples/snippets/bigquery_modules_test.py @@ -14,6 +14,19 @@ def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_array_agg] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + s = bpd.Series([0, 1, 2, 3, 4, 5]) + + # Group values by whether they are divisble by 2 and aggregate them into arrays + bbq.array_agg(s.groupby(s % 2 == 0)) + # False [1 3 5] + # True [0 2 4] + # dtype: list[pyarrow] + # [END bigquery_dataframes_bigquery_methods_array_agg] + # [START bigquery_dataframes_bigquery_methods_struct] import bigframes.bigquery as bbq import bigframes.pandas as bpd @@ -36,6 +49,22 @@ def test_bigquery_dataframes_examples() -> None: # dtype: struct[pyarrow] # [END bigquery_dataframes_bigquery_methods_struct] + # [START bigquery_dataframes_bigquery_methods_unix_micros] + import pandas as pd + + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Create a series that consists of three timestamps: [1970-01-01, 1970-01-02, 1970-01-03] + s = bpd.Series(pd.date_range("1970-01-01", periods=3, freq="d", tz="UTC")) + + bbq.unix_micros(s) + # 0 0 + # 1 86400000000 + # 2 172800000000 + # dtype: Int64 + # [END bigquery_dataframes_bigquery_methods_unix_micros] + # [START bigquery_dataframes_bigquery_methods_scalar] import bigframes.bigquery as bbq import bigframes.pandas as bpd diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py index cf809ebb3a..fe5d7d5b1e 100644 --- a/samples/snippets/gemini_model_test.py +++ b/samples/snippets/gemini_model_test.py @@ -30,7 +30,7 @@ def test_gemini_text_generator_model() -> None: session = bpd.get_global_session() connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" model = GeminiTextGenerator( - session=session, connection_name=connection, model_name="gemini-1.5-flash-002" + session=session, connection_name=connection, model_name="gemini-2.0-flash-001" ) df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 368f82d849..7f8e13cd7b 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -78,7 +78,7 @@ def 
test_multimodal_dataframe(gcs_dst_bucket: str) -> None: # [START bigquery_dataframes_multimodal_dataframe_ml_text] from bigframes.ml import llm - gemini = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-002") + gemini = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") # Deal with first 2 images as example df_image = df_image.head(2) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index c26c6f4442..bc05cd2512 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -14,16 +14,7 @@ def run_quickstart(project_id: str) -> None: - import bigframes - - session_options = bigframes.BigQueryOptions() - session = bigframes.connect(session_options) - your_gcp_project_id = project_id - query_or_table = "bigquery-public-data.ml_datasets.penguins" - df_session = session.read_gbq(query_or_table) - average_body_mass = df_session["body_mass_g"].mean() - print(f"average_body_mass (df_session): {average_body_mass}") # [START bigquery_bigframes_quickstart] import bigframes.pandas as bpd @@ -33,10 +24,20 @@ def run_quickstart(project_id: str) -> None: # On BigQuery Studio, the project ID is automatically detected. bpd.options.bigquery.project = your_gcp_project_id + # Use "partial" ordering mode to generate more efficient queries, but the + # order of the rows in DataFrames may not be deterministic if you have not + # explictly sorted it. Some operations that depend on the order, such as + # head() will not function until you explictly order the DataFrame. Set the + # ordering mode to "strict" (default) for more pandas compatibility. + bpd.options.bigquery.ordering_mode = "partial" + # Create a DataFrame from a BigQuery table query_or_table = "bigquery-public-data.ml_datasets.penguins" df = bpd.read_gbq(query_or_table) + # Efficiently preview the results using the .peek() method. + df.peek() + # Use the DataFrame just as you would a pandas DataFrame, but calculations # happen in the BigQuery query engine instead of the local system. 
average_body_mass = df["body_mass_g"].mean() @@ -70,3 +71,7 @@ def run_quickstart(project_id: str) -> None: model.fit(X, y) model.score(X, y) # [END bigquery_bigframes_quickstart] + + # close session and reset option so not to affect other tests + bpd.close_session() + bpd.options.reset() diff --git a/samples/snippets/quickstart_test.py b/samples/snippets/quickstart_test.py index 4abc87d011..a650f8365d 100644 --- a/samples/snippets/quickstart_test.py +++ b/samples/snippets/quickstart_test.py @@ -33,4 +33,4 @@ def test_quickstart( quickstart.run_quickstart(your_project_id) out, _ = capsys.readouterr() - assert "average_body_mass (df_session):" in out + assert "average_body_mass:" in out diff --git a/setup.py b/setup.py index edd8e63e65..ff40d29a16 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ # please keep these in sync with the minimum versions in testing/constraints-3.9.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0", + "gcsfs >=2023.3.0, !=2025.5.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", diff --git a/tests/system/conftest.py b/tests/system/conftest.py index fb7d8d4e32..824e774dbe 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -920,8 +920,8 @@ def llm_text_pandas_df(): @pytest.fixture(scope="session") -def llm_text_df(test_session, llm_text_pandas_df): - return test_session.read_pandas(llm_text_pandas_df) +def llm_text_df(session, llm_text_pandas_df): + return session.read_pandas(llm_text_pandas_df) @pytest.fixture(scope="session") @@ -1494,9 +1494,9 @@ def images_uris() -> list[str]: @pytest.fixture(scope="session") def images_mm_df( - images_uris, test_session: bigframes.Session, bq_connection: str + images_uris, session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: - blob_series = bpd.Series(images_uris, session=test_session).str.to_blob( + blob_series = bpd.Series(images_uris, session=session).str.to_blob( connection=bq_connection ) return blob_series.rename("blob_col").to_frame() @@ -1518,8 +1518,6 @@ def pdf_gcs_path() -> str: @pytest.fixture(scope="session") def pdf_mm_df( - pdf_gcs_path, test_session: bigframes.Session, bq_connection: str + pdf_gcs_path, session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: - return test_session.from_glob_path( - pdf_gcs_path, name="pdf", connection=bq_connection - ) + return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index c189d249a7..3ebded3d29 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -53,9 +53,9 @@ def images_output_uris(images_output_folder: str) -> list[str]: def test_blob_exif( bq_connection: str, - test_session: bigframes.Session, + session: bigframes.Session, ): - exif_image_df = test_session.from_glob_path( + exif_image_df = session.from_glob_path( "gs://bigframes_blob_test/images_exif/*", name="blob_col", connection=bq_connection, @@ -64,7 +64,7 @@ def test_blob_exif( actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection) expected = bpd.Series( ['{"ExifOffset": 47, "Make": "MyCamera"}'], - session=test_session, + session=session, dtype=dtypes.JSON_DTYPE, ) pd.testing.assert_series_equal( @@ -79,9 +79,9 @@ def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - 
series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) @@ -147,9 +147,9 @@ def test_blob_image_resize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) @@ -217,9 +217,9 @@ def test_blob_image_normalize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - test_session: bigframes.Session, + session: bigframes.Session, ): - series = bpd.Series(images_output_uris, session=test_session).str.to_blob( + series = bpd.Series(images_output_uris, session=session).str.to_blob( connection=bq_connection ) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index d1a5f9f2aa..e0e4b79c6f 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -13,6 +13,7 @@ # limitations under the License. import pandas as pd +import pandas.testing from bigframes.ml import decomposition from tests.system import utils @@ -193,7 +194,16 @@ def test_decomposition_mf_configure_fit_load( ) ) - reloaded_model.score(new_ratings) + # Make sure the input to score is not ignored. + scores_training_data = reloaded_model.score().to_pandas() + scores_new_ratings = reloaded_model.score(new_ratings).to_pandas() + pandas.testing.assert_index_equal( + scores_training_data.columns, scores_new_ratings.columns + ) + assert ( + scores_training_data["mean_squared_error"].iloc[0] + != scores_new_ratings["mean_squared_error"].iloc[0] + ) result = reloaded_model.predict(new_ratings).to_pandas() diff --git a/tests/system/large/operations/conftest.py b/tests/system/large/operations/conftest.py index 4f6e2d1704..6f64c7552f 100644 --- a/tests/system/large/operations/conftest.py +++ b/tests/system/large/operations/conftest.py @@ -22,7 +22,7 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator: return llm.GeminiTextGenerator( session=session, connection_name=bq_connection, - model_name="gemini-1.5-flash-001", + model_name="gemini-2.0-flash-001", ) diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 04074a2ea6..1b1d3a3376 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -66,6 +66,31 @@ def test_filter(session, gemini_flash_model): ) +def test_filter_attach_logprob(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "number_1": [1, 2], + "number_2": [2, 1], + "col": [0, 0], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{number_1} is greater than {number_2}", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + def test_filter_multi_model(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -186,7 +211,14 @@ def test_filter_invalid_model_raise_error(): df.ai.filter("{city} is the capital of {country}", None) -def test_map(session, gemini_flash_model): +@pytest.mark.parametrize( + ("output_schema", "output_col"), + [ + pytest.param(None, "ml_generate_text_llm_result", 
id="default_schema"), + pytest.param({"food": "string"}, "food", id="non_default_schema"), + ], +) +def test_map(session, gemini_flash_model, output_schema, output_col): df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -204,18 +236,18 @@ def test_map(session, gemini_flash_model): ): actual_df = df.ai.map( "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - "food", gemini_flash_model, + output_schema=output_schema, ).to_pandas() # Result sanitation - actual_df["food"] = actual_df["food"].str.strip().str.lower() + actual_df[output_col] = actual_df[output_col].str.strip().str.lower() expected_df = pd.DataFrame( { "ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"], "gluten-free": [True, True], - "food": ["burger", "tofu"], + output_col: ["burger", "tofu"], } ) pandas.testing.assert_frame_equal( @@ -227,6 +259,31 @@ def test_map(session, gemini_flash_model): ) +def test_map_attach_logprob(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + def test_map_multimodel(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -244,8 +301,8 @@ def test_map_multimodel(session, gemini_flash_model): ) result = df.ai.map( "What is the object in {image} combined with {scenario}? One word only.", - "object", gemini_flash_model, + output_schema={"object": "string"}, ).to_pandas() assert len(result) == len(df) @@ -279,7 +336,6 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): ): df.ai.map( "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - "food", gemini_flash_model, ) @@ -319,7 +375,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): THRESHOLD_OPTION, 10, ), pytest.raises(ValueError): - df.ai.map(instruction, "food", gemini_flash_model) + df.ai.map(instruction, gemini_flash_model, output_schema={"food": "string"}) def test_map_invalid_model_raise_error(): @@ -338,7 +394,6 @@ def test_map_invalid_model_raise_error(): ), pytest.raises(TypeError): df.ai.map( "What is the food made from {ingredient_1} and {ingredient_2}? 
One word only.", - "food", None, ) @@ -396,6 +451,34 @@ def test_join(instruction, session, gemini_flash_model): ) +def test_join_attach_logprob(session, gemini_flash_model): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.ai.join( + countries, + "{city} is in {country}", + gemini_flash_model, + attach_logprobs=True, + ).to_pandas() + + assert "logprob" in actual_df.columns + + @pytest.mark.parametrize( ("reply"), [ @@ -434,7 +517,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) def test_self_join(session, gemini_flash_model): animals = dataframe.DataFrame( data={ - "animal": ["spider", "capybara"], + "animal": ["ant", "elephant"], }, session=session, ) @@ -453,8 +536,8 @@ def test_self_join(session, gemini_flash_model): expected_df = pd.DataFrame( { - "animal_left": ["capybara"], - "animal_right": ["spider"], + "animal_left": ["elephant"], + "animal_right": ["ant"], } ) pandas.testing.assert_frame_equal( diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index c3f08c6204..7ae78a5c53 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -22,6 +22,11 @@ import bigframes from bigframes import dataframe, dtypes, exceptions, series +pytest.skip( + "Semantics namespace is deprecated. ", + allow_module_level=True, +) + SEM_OP_EXP_OPTION = "experiments.semantic_operators" BLOB_EXP_OPTION = "experiments.blob" THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" @@ -81,7 +86,7 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): cluster_column=cluster_column, ).to_pandas() - expected_s = pd.Series(["Leonardo \n"], dtype=dtypes.STRING_DTYPE) + expected_s = pd.Series(["Leonardo\n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Movies" pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) @@ -132,12 +137,13 @@ def test_agg_w_int_column(session, gemini_flash_model): "Movies": [ "Killers of the Flower Moon", "The Great Gatsby", + "The Wolf of Wall Street", ], - "Years": [2023, 2013], + "Years": [2023, 2013, 2013], }, session=session, ) - instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." + instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Your answer should be the four-digit year, returned as a string." 
with bigframes.option_context( SEM_OP_EXP_OPTION, @@ -150,7 +156,7 @@ def test_agg_w_int_column(session, gemini_flash_model): model=gemini_flash_model, ).to_pandas() - expected_s = pd.Series(["2013 \n"], dtype=dtypes.STRING_DTYPE) + expected_s = pd.Series(["2013\n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Years" pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) @@ -759,7 +765,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) def test_self_join(session, gemini_flash_model): animals = dataframe.DataFrame( data={ - "animal": ["spider", "capybara"], + "animal": ["ant", "elephant"], }, session=session, ) @@ -778,8 +784,8 @@ def test_self_join(session, gemini_flash_model): expected_df = pd.DataFrame( { - "animal_left": ["capybara"], - "animal_right": ["spider"], + "animal_left": ["elephant"], + "animal_right": ["ant"], } ) pandas.testing.assert_frame_equal( diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py index b10e361129..87d2acd34b 100644 --- a/tests/system/large/test_dataframe_io.py +++ b/tests/system/large/test_dataframe_io.py @@ -29,6 +29,20 @@ def test_to_pandas_batches_raise_when_large_result_not_allowed(session): next(df.to_pandas_batches(page_size=500, max_results=1500)) +def test_large_df_peek_no_job(session): + execution_count_before = session._metrics.execution_count + + # only works with null index, as sequential index requires row_number over full table scan. + df = session.read_gbq( + WIKIPEDIA_TABLE, index_col=bigframes.enums.DefaultIndexKind.NULL + ) + result = df.peek(50) + execution_count_after = session._metrics.execution_count + + assert len(result) == 50 + assert execution_count_after == execution_count_before + + def test_to_pandas_batches_override_global_option( session, ): diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index d4428c1f95..3ebe2bb040 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -14,8 +14,6 @@ import typing -from google.cloud import bigquery -from google.cloud.bigquery_storage import types as bqstorage_types import pandas import pandas.testing import pytest @@ -41,7 +39,15 @@ def _assert_bq_execution_location( if expected_location is None: expected_location = session._location - assert typing.cast(bigquery.QueryJob, df.query_job).location == expected_location + query_job = df.query_job + assert query_job is not None + assert query_job.location == expected_location + destination = query_job.destination + assert destination is not None + destination_dataset = session.bqclient.get_dataset( + f"{destination.project}.{destination.dataset_id}" + ) + assert destination_dataset.location == expected_location # Ensure operation involving BQ client suceeds result = ( @@ -52,38 +58,28 @@ def _assert_bq_execution_location( .head() ) - assert ( - typing.cast(bigquery.QueryJob, result.query_job).location == expected_location + # Use allow_large_results = True to force a job to be created. 
+ result_pd = result.to_pandas(allow_large_results=True) + + query_job = df.query_job + assert query_job is not None + assert query_job.location == expected_location + destination = query_job.destination + assert destination is not None + destination_dataset = session.bqclient.get_dataset( + f"{destination.project}.{destination.dataset_id}" ) + assert destination_dataset.location == expected_location expected_result = pandas.DataFrame( {"number": [444, 222]}, index=pandas.Index(["aaa", "bbb"], name="name") ) pandas.testing.assert_frame_equal( - expected_result, result.to_pandas(), check_dtype=False, check_index_type=False - ) - - # Ensure BQ Storage Read client operation succceeds - table = result.query_job.destination - requested_session = bqstorage_types.ReadSession( # type: ignore[attr-defined] - table=f"projects/{table.project}/datasets/{table.dataset_id}/tables/{table.table_id}", - data_format=bqstorage_types.DataFormat.ARROW, # type: ignore[attr-defined] - ) - read_session = session.bqstoragereadclient.create_read_session( - parent=f"projects/{table.project}", - read_session=requested_session, - max_stream_count=1, + expected_result, + result_pd, + check_dtype=False, + check_index_type=False, ) - reader = session.bqstoragereadclient.read_rows(read_session.streams[0].name) - frames = [] - for message in reader.rows().pages: - frames.append(message.to_dataframe()) - read_dataframe = pandas.concat(frames) - # normalize before comparing since we lost some of the bigframes column - # naming abtractions in the direct read of the destination table - read_dataframe = read_dataframe.set_index("name") - read_dataframe.columns = result.columns - pandas.testing.assert_frame_equal(expected_result, read_dataframe) def test_bq_location_default(): diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index d346d109be..5cf9621ef9 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -41,9 +41,8 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) def test_llm_gemini_configure_fit( @@ -80,7 +79,7 @@ def test_llm_gemini_configure_fit( @pytest.mark.flaky(retries=2) def test_llm_gemini_w_ground_with_google_search(llm_remote_text_df): - model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-002", max_iterations=1) + model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001", max_iterations=1) df = model.predict( llm_remote_text_df["prompt"], ground_with_google_search=True, diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index b9d21f226a..3d155b5f16 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -99,7 +99,8 @@ def test_json_extract_from_json(): ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract(s, "$.a.b") + with pytest.warns(UserWarning, match="The `json_extract` is deprecated"): + actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @@ -212,6 +213,34 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(s) +def test_json_query_from_json(): + s = bpd.Series( + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=dtypes.JSON_DTYPE, + 
) + actual = bbq.json_query(s, "$.a.b") + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_from_string(): + s = bpd.Series( + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query(s, "$.a.b") + expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_w_invalid_series_type(): + s = bpd.Series([1, 2]) + with pytest.raises(TypeError): + bbq.json_query(s, "$.a") + + def test_json_value_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py index 283624100a..c519b427fa 100644 --- a/tests/system/small/bigquery/test_sql.py +++ b/tests/system/small/bigquery/test_sql.py @@ -12,11 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bigframes.bigquery +import pandas as pd +import pytest +import bigframes.bigquery as bbq +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd -def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): - series = bigframes.bigquery.sql_scalar( + +def test_sql_scalar_for_all_scalar_types(scalars_df_null_index): + series = bbq.sql_scalar( """ CAST({0} AS INT64) + BYTE_LENGTH({1}) @@ -48,3 +53,109 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): ) result = series.to_pandas() assert len(result) == len(scalars_df_null_index) + + +def test_sql_scalar_for_bool_series(scalars_df_index): + series: bpd.Series = scalars_df_index["bool_col"] + result = bbq.sql_scalar("CAST({0} AS INT64)", [series]) + expected = series.astype(dtypes.INT_DTYPE) + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +@pytest.mark.parametrize( + ("column_name"), + [ + pytest.param("bool_col"), + pytest.param("bytes_col"), + pytest.param("date_col"), + pytest.param("datetime_col"), + pytest.param("geography_col"), + pytest.param("int64_col"), + pytest.param("numeric_col"), + pytest.param("float64_col"), + pytest.param("string_col"), + pytest.param("time_col"), + pytest.param("timestamp_col"), + ], +) +def test_sql_scalar_outputs_all_scalar_types(scalars_df_index, column_name): + series: bpd.Series = scalars_df_index[column_name] + result = bbq.sql_scalar("{0}", [series]) + expected = series + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_array_series(repeated_df): + result = bbq.sql_scalar( + """ + ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2}) + + ARRAY_LENGTH({3}) + ARRAY_LENGTH({4}) + ARRAY_LENGTH({5}) + + ARRAY_LENGTH({6}) + """, + [ + repeated_df["int_list_col"], + repeated_df["bool_list_col"], + repeated_df["float_list_col"], + repeated_df["date_list_col"], + repeated_df["date_time_list_col"], + repeated_df["numeric_list_col"], + repeated_df["string_list_col"], + ], + ) + + expected = ( + repeated_df["int_list_col"].list.len() + + repeated_df["bool_list_col"].list.len() + + repeated_df["float_list_col"].list.len() + + repeated_df["date_list_col"].list.len() + + repeated_df["date_time_list_col"].list.len() + + repeated_df["numeric_list_col"].list.len() + + repeated_df["string_list_col"].list.len() 
+ ) + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_array_series(repeated_df): + result = bbq.sql_scalar("{0}", [repeated_df["int_list_col"]]) + expected = repeated_df["int_list_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_struct_series(nested_structs_df): + result = bbq.sql_scalar( + "CHAR_LENGTH({0}.name) + {0}.age", + [nested_structs_df["person"]], + ) + expected = nested_structs_df["person"].struct.field( + "name" + ).str.len() + nested_structs_df["person"].struct.field("age") + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_struct_series(nested_structs_df): + result = bbq.sql_scalar("{0}", [nested_structs_df["person"]]) + expected = nested_structs_df["person"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_json_series(json_df): + result = bbq.sql_scalar( + """JSON_VALUE({0}, '$.int_value')""", + [ + json_df["json_col"], + ], + ) + expected = bbq.json_value(json_df["json_col"], "$.int_value") + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_outputs_json_series(json_df): + result = bbq.sql_scalar("{0}", [json_df["json_col"]]) + expected = json_df["json_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 806dad71dc..d3b4c4faa0 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -19,9 +19,9 @@ def test_blob_create_from_uri_str( - bq_connection: str, test_session: bigframes.Session, images_uris + bq_connection: str, session: bigframes.Session, images_uris ): - uri_series = bpd.Series(images_uris, session=test_session) + uri_series = bpd.Series(images_uris, session=session) blob_series = uri_series.str.to_blob(connection=bq_connection) pd_blob_df = blob_series.struct.explode().to_pandas() @@ -40,9 +40,9 @@ def test_blob_create_from_uri_str( def test_blob_create_from_glob_path( - bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): - blob_df = test_session.from_glob_path( + blob_df = session.from_glob_path( images_gcs_path, connection=bq_connection, name="blob_col" ) pd_blob_df = ( @@ -68,11 +68,11 @@ def test_blob_create_from_glob_path( def test_blob_create_read_gbq_object_table( - bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): - obj_table = test_session._create_object_table(images_gcs_path, bq_connection) + obj_table = session._create_object_table(images_gcs_path, bq_connection) - blob_df = test_session.read_gbq_object_table(obj_table, name="blob_col") + blob_df = session.read_gbq_object_table(obj_table, name="blob_col") pd_blob_df = ( blob_df["blob_col"] .struct.explode() diff --git a/tests/system/small/core/indexes/test_base.py b/tests/system/small/core/indexes/test_base.py new file mode 100644 index 0000000000..05ea40cfb9 --- /dev/null +++ b/tests/system/small/core/indexes/test_base.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from packaging import version +import pandas as pd +import pandas.testing +import pytest + + +@pytest.mark.parametrize("level", [None, 0, 1, "level0", "level1"]) +def test_unique(session, level): + if version.Version(pd.__version__) < version.Version("2.0.0"): + pytest.skip("StringDtype for multi-index not supported until Pandas 2.0") + arrays = [ + pd.Series(["A", "A", "B", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")), + pd.Series([1, 2, 1, 2, 1], dtype=pd.Int64Dtype()), + ] + pd_idx = pd.MultiIndex.from_arrays(arrays, names=["level0", "level1"]) + bf_idx = session.read_pandas(pd_idx) + + actual_result = bf_idx.unique(level).to_pandas() + + expected_result = pd_idx.unique(level) + pandas.testing.assert_index_equal(actual_result, expected_result) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 3f06a02469..3d5453099d 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -108,12 +108,6 @@ def test_create_load_multimodal_embedding_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -143,12 +137,6 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -170,12 +158,6 @@ def test_gemini_text_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -199,12 +181,6 @@ def test_gemini_text_generator_predict_with_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -230,13 +206,9 @@ def test_gemini_text_generator_multi_cols_predict_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky(retries=2) @@ -756,8 +728,6 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -786,8 +756,6 
@@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name): @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-002", - "gemini-1.5-flash-002", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -815,11 +783,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) @pytest.mark.parametrize( "model_name", - ( - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-2.0-flash-exp", - ), + ("gemini-2.0-flash-exp",), ) def test_gemini_preview_model_warnings(model_name): with pytest.warns(exceptions.PreviewWarning): diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index e066d00cf5..beee95636f 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -23,10 +23,10 @@ @pytest.mark.flaky(retries=2) def test_multimodal_embedding_generator_predict_default_params_success( - images_mm_df, test_session, bq_connection + images_mm_df, session, bq_connection ): text_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=test_session + connection_name=bq_connection, session=session ) df = text_embedding_model.predict(images_mm_df).to_pandas() utils.check_pandas_df_schema_and_index( @@ -41,10 +41,6 @@ def test_multimodal_embedding_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", @@ -52,10 +48,10 @@ def test_multimodal_embedding_generator_predict_default_params_success( ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_multimodal_input( - images_mm_df: bpd.DataFrame, model_name, test_session, bq_connection + images_mm_df: bpd.DataFrame, model_name, session, bq_connection ): gemini_text_generator_model = llm.GeminiTextGenerator( - model_name=model_name, connection_name=bq_connection, session=test_session + model_name=model_name, connection_name=bq_connection, session=session ) pd_df = gemini_text_generator_model.predict( images_mm_df, prompt=["Describe", images_mm_df["blob_col"]] @@ -71,20 +67,16 @@ def test_gemini_text_generator_multimodal_input( @pytest.mark.parametrize( "model_name", ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", "gemini-2.0-flash-exp", "gemini-2.0-flash-001", ), ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_multimodal_structured_output( - images_mm_df: bpd.DataFrame, model_name, test_session, bq_connection + images_mm_df: bpd.DataFrame, model_name, session, bq_connection ): gemini_text_generator_model = llm.GeminiTextGenerator( - model_name=model_name, connection_name=bq_connection, session=test_session + model_name=model_name, connection_name=bq_connection, session=session ) output_schema = { "bool_output": "bool", diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index de6ba4b86c..25d411bef8 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -51,7 +51,11 @@ def test_filter(session): df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) model = FakeGeminiTextGenerator( dataframe.DataFrame( - {"ml_generate_text_llm_result": ["true", "false"]}, session=session + { + "answer": [True, False], + "full_response": _create_dummy_full_response(2), + }, + session=session, ), ) 
@@ -77,7 +81,11 @@ def test_map(session): df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) model = FakeGeminiTextGenerator( dataframe.DataFrame( - {"ml_generate_text_llm_result": ["true", "false"]}, session=session + { + "output": ["true", "false"], + "full_response": _create_dummy_full_response(2), + }, + session=session, ), ) @@ -87,7 +95,9 @@ def test_map(session): THRESHOLD_OPTION, 50, ): - result = df.ai.map("map {col}", model=model, output_column="output").to_pandas() + result = df.ai.map( + "map {col}", model=model, output_schema={"output": "string"} + ).to_pandas() pandas.testing.assert_frame_equal( result, @@ -102,7 +112,13 @@ def test_join(session): left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) model = FakeGeminiTextGenerator( - dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session), + dataframe.DataFrame( + { + "answer": [True], + "full_response": _create_dummy_full_response(1), + }, + session=session, + ), ) with bigframes.option_context( @@ -139,3 +155,9 @@ def test_top_k(session): result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() assert len(result) == 1 + + +def _create_dummy_full_response(row_count: int) -> pd.Series: + entry = """{"candidates": [{"avg_logprobs": -0.5}]}""" + + return pd.Series([entry] * row_count) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index e4824875b4..032d93c19d 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -98,6 +98,7 @@ def test_str_extract(scalars_dfs, pat): (re.compile("(?i).e.."), "blah", None, 0, True), ("H", "h", True, 0, False), (", ", "__", True, 0, False), + (re.compile(r"hEllo", flags=re.I), "blah", None, 0, True), ], ) def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 00ce0c722b..a821901e4c 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -19,6 +19,7 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count df = maybe_ordered_session._read_gbq_colab( """ SELECT @@ -32,9 +33,11 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi LIMIT 300 """ ) + executions_before_python = maybe_ordered_session._metrics.execution_count batches = df.to_pandas_batches( page_size=100, ) + executions_after = maybe_ordered_session._metrics.execution_count total_rows = 0 for batch in batches: @@ -42,6 +45,55 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi total_rows += len(batch.index) assert total_rows > 0 + assert executions_after == executions_before_python == executions_before_sql + 1 + + +def test_read_gbq_colab_peek_avoids_requery(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count + df = maybe_ordered_session._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + executions_before_python = maybe_ordered_session._metrics.execution_count + result = df.peek(100) + executions_after = maybe_ordered_session._metrics.execution_count + 
+ # Ok, this isn't guaranteed by peek, but should happen with read api based impl + # if starts failing, maybe stopped using read api? + assert result["total"].is_monotonic_decreasing + + assert len(result) == 100 + assert executions_after == executions_before_python == executions_before_sql + 1 + + +def test_read_gbq_colab_repr_avoids_requery(maybe_ordered_session): + executions_before_sql = maybe_ordered_session._metrics.execution_count + df = maybe_ordered_session._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + executions_before_python = maybe_ordered_session._metrics.execution_count + _ = repr(df) + executions_after = maybe_ordered_session._metrics.execution_count + assert executions_after == executions_before_python == executions_before_sql + 1 def test_read_gbq_colab_includes_formatted_scalars(session): @@ -73,3 +125,31 @@ def test_read_gbq_colab_includes_formatted_scalars(session): } ), ) + + +def test_read_gbq_colab_includes_formatted_bigframes_dataframe( + session, scalars_df_index, scalars_pandas_df_index +): + pyformat_args = { + # Apply some operations to make sure the columns aren't renamed. + "some_dataframe": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( + int64_col=scalars_df_index["int64_too"] + ), + # This is not a supported type, but ignored if not referenced. + "some_object": object(), + } + df = session._read_gbq_colab( + """ + SELECT int64_col, rowindex + FROM {some_dataframe} + ORDER BY rowindex ASC + """, + pyformat_args=pyformat_args, + ) + result = df.to_pandas() + expected = ( + scalars_pandas_df_index[scalars_pandas_df_index["int64_col"] > 0] + .assign(int64_col=scalars_pandas_df_index["int64_too"]) + .reset_index(drop=False)[["int64_col", "rowindex"]] + ) + pandas.testing.assert_frame_equal(result, expected) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 00c11d073e..c80ced45a5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1635,6 +1635,48 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): ) +def test_self_merge_self_w_on_args(): + data = { + "A": pd.Series([1, 2, 3], dtype="Int64"), + "B": pd.Series([1, 2, 3], dtype="Int64"), + "C": pd.Series([100, 200, 300], dtype="Int64"), + "D": pd.Series(["alpha", "beta", "gamma"], dtype="string[pyarrow]"), + } + df = pd.DataFrame(data) + + df1 = df[["A", "C"]] + df2 = df[["B", "C", "D"]] + pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") + + bf_df = bpd.DataFrame(data) + + bf_df1 = bf_df[["A", "C"]] + bf_df2 = bf_df[["B", "C", "D"]] + bf_result = bf_df1.merge( + bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" + ).to_pandas() + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("decimals",), + [ + (2,), + ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), + ({},), + ], +) +def test_dataframe_round(scalars_dfs, decimals): + if pd.__version__.startswith("1."): + pytest.skip("Rounding doesn't work as expected in pandas 1.x") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.round(decimals).to_pandas() + pd_result = scalars_pandas_df.round(decimals) + + assert_pandas_df_equal(bf_result, pd_result) + + def test_get_dtypes(scalars_df_default_index): dtypes = scalars_df_default_index.dtypes dtypes_dict: Dict[str, 
bigframes.dtypes.Dtype] = { @@ -3619,9 +3661,7 @@ def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): scalars_df_index.iat[index] -def test_iloc_single_integer_out_of_bound_error( - scalars_df_index, scalars_pandas_df_index -): +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): scalars_df_index.iloc[99] @@ -3636,6 +3676,17 @@ def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): ) +def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): + idx_list = [0, 3, 5] + bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() pd_result = scalars_pandas_df_index.loc[:, "int64_col"] @@ -5601,3 +5652,29 @@ def test_astype_invalid_type_fail(scalars_dfs): with pytest.raises(TypeError, match=r".*Share your usecase with.*"): bf_df.astype(123) + + +def test_agg_with_dict(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": ["min", "count"], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): + bf_df, _ = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "nonexisting_col": ["count"], + } + + with pytest.raises(KeyError): + bf_df.agg(agg_funcs) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d24b592b0d..5df7283e3c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -15,6 +15,8 @@ from typing import Tuple import google.api_core.exceptions +import numpy +import numpy.testing import pandas as pd import pandas.testing import pyarrow as pa @@ -319,7 +321,8 @@ def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): result = bf_df.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 0 def test_to_arrow_override_global_option(scalars_df_index): @@ -458,7 +461,7 @@ def test_to_csv_tabs( [True, False], ) @pytest.mark.skipif(pandas_gbq is None, reason="required by pd.read_gbq") -def test_to_gbq_index(scalars_dfs, dataset_id, index): +def test_to_gbq_w_index(scalars_dfs, dataset_id, index): """Test the `to_gbq` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs destination_table = f"{dataset_id}.test_index_df_to_gbq_{index}" @@ -485,48 +488,67 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index): pd.testing.assert_frame_equal(df_out, expected, check_index_type=False) -@pytest.mark.parametrize( - ("if_exists", "expected_index"), - [ - pytest.param("replace", 1), - pytest.param("append", 2), - pytest.param( - "fail", - 0, - marks=pytest.mark.xfail( - raises=google.api_core.exceptions.Conflict, - ), - ), - pytest.param( - "unknown", - 0, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], -) -@pytest.mark.skipif(pandas_gbq is None, reason="required by pd.read_gbq") -def test_to_gbq_if_exists( - scalars_df_default_index, - scalars_pandas_df_default_index, - 
dataset_id,
-    if_exists,
-    expected_index,
-):
-    """Test the `to_gbq` API with the `if_exists` parameter."""
-    destination_table = f"{dataset_id}.test_to_gbq_if_exists_{if_exists}"
+def test_to_gbq_if_exists_is_fail(scalars_dfs, dataset_id):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_fail"
+    scalars_df.to_gbq(destination_table)
-    scalars_df_default_index.to_gbq(destination_table)
-    scalars_df_default_index.to_gbq(destination_table, if_exists=if_exists)
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == len(scalars_pandas_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns)
-    gcs_df = pd.read_gbq(destination_table)
-    assert len(gcs_df.index) == expected_index * len(
-        scalars_pandas_df_default_index.index
-    )
-    pd.testing.assert_index_equal(
-        gcs_df.columns, scalars_pandas_df_default_index.columns
-    )
+    # Test that the default value of `if_exists` is "fail".
+    with pytest.raises(ValueError, match="Table already exists"):
+        scalars_df.to_gbq(destination_table)
+
+    with pytest.raises(ValueError, match="Table already exists"):
+        scalars_df.to_gbq(destination_table, if_exists="fail")
+
+
+def test_to_gbq_if_exists_is_replace(scalars_dfs, dataset_id):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_replace"
+    scalars_df.to_gbq(destination_table)
+
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == len(scalars_pandas_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns)
+
+    # When replacing a table with the same schema
+    scalars_df.to_gbq(destination_table, if_exists="replace")
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == len(scalars_pandas_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns)
+
+    # When replacing a table with a different schema
+    partial_scalars_df = scalars_df.drop(columns=["string_col"])
+    partial_scalars_df.to_gbq(destination_table, if_exists="replace")
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == len(partial_scalars_df)
+    pd.testing.assert_index_equal(gcs_df.columns, partial_scalars_df.columns)
+
+
+def test_to_gbq_if_exists_is_append(scalars_dfs, dataset_id):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    destination_table = f"{dataset_id}.test_to_gbq_if_exists_is_append"
+    scalars_df.to_gbq(destination_table)
+
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == len(scalars_pandas_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns)
+
+    # When appending to a table with the same schema
+    scalars_df.to_gbq(destination_table, if_exists="append")
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == 2 * len(scalars_pandas_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns)
+
+    # When appending to a table with a different schema
+    partial_scalars_df = scalars_df.drop(columns=["string_col"])
+    partial_scalars_df.to_gbq(destination_table, if_exists="append")
+    gcs_df = pd.read_gbq(destination_table, index_col="rowindex")
+    assert len(gcs_df) == 3 * len(partial_scalars_df)
+    pd.testing.assert_index_equal(gcs_df.columns, scalars_df.columns)


 def test_to_gbq_w_duplicate_column_names(
@@ -773,6 +795,27 @@ def test_to_gbq_w_clustering_no_destination(
     assert table.expires is not None


+def test_to_gbq_w_clustering_existing_table(
+
scalars_df_default_index, + dataset_id, + bigquery_client, +): + destination_table = f"{dataset_id}.test_to_gbq_w_clustering_existing_table" + scalars_df_default_index.to_gbq(destination_table) + + table = bigquery_client.get_table(destination_table) + assert table.clustering_fields is None + assert table.expires is None + + with pytest.raises(ValueError, match="Table clustering fields cannot be changed"): + clustering_columns = ["int64_col"] + scalars_df_default_index.to_gbq( + destination_table, + if_exists="replace", + clustering_columns=clustering_columns, + ) + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id") @@ -1020,3 +1063,12 @@ def test_to_sql_query_named_index_excluded( utils.assert_pandas_df_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True ) + + +def test_to_numpy(scalars_dfs): + bf_df, pd_df = scalars_dfs + + bf_result = numpy.array(bf_df[["int64_too"]], dtype="int64") + pd_result = numpy.array(pd_df[["int64_too"]], dtype="int64") + + numpy.testing.assert_array_equal(bf_result, pd_result) diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py index 78e561c2fd..306b15e67a 100644 --- a/tests/system/small/test_index_io.py +++ b/tests/system/small/test_index_io.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd + import bigframes @@ -35,7 +37,8 @@ def test_to_pandas_dry_run(scalars_df_index): result = index.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 0 def test_to_numpy_override_global_option(scalars_df_index): diff --git a/tests/system/small/test_large_local_data.py b/tests/system/small/test_large_local_data.py new file mode 100644 index 0000000000..eddec37132 --- /dev/null +++ b/tests/system/small/test_large_local_data.py @@ -0,0 +1,55 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd +import pytest + +import bigframes +from tests.system.utils import assert_pandas_df_equal + +large_dataframe = pd.DataFrame(np.random.rand(10000, 10), dtype="Float64") +large_dataframe.index = large_dataframe.index.astype("Int64") + + +def test_read_pandas_defer_noop(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + + assert_pandas_df_equal(large_dataframe, bf_df.to_pandas()) + + +def test_read_pandas_defer_cumsum(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_df = bf_df.cumsum() + + assert_pandas_df_equal(large_dataframe.cumsum(), bf_df.to_pandas()) + + +def test_read_pandas_defer_cache_cumsum_cumsum(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_df = bf_df.cumsum().cache().cumsum() + + assert_pandas_df_equal(large_dataframe.cumsum().cumsum(), bf_df.to_pandas()) + + +def test_read_pandas_defer_peek(session: bigframes.Session): + pytest.importorskip("pandas", minversion="2.0.0") + bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") + bf_result = bf_df.peek(15) + + assert len(bf_result) == 15 + assert_pandas_df_equal(large_dataframe.loc[bf_result.index], bf_result) diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index d59b6d66b5..55e5036a42 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -279,16 +279,18 @@ def test_credentials_need_reauthentication( # Call get_global_session() *after* read_gbq so that our location detection # has a chance to work. session = bpd.get_global_session() - assert session.bqclient._credentials.valid + assert session.bqclient._http.credentials.valid with monkeypatch.context() as m: # Simulate expired credentials to trigger the credential refresh flow - m.setattr(session.bqclient._credentials, "expiry", datetime.datetime.utcnow()) - assert not session.bqclient._credentials.valid + m.setattr( + session.bqclient._http.credentials, "expiry", datetime.datetime.utcnow() + ) + assert not session.bqclient._http.credentials.valid # Simulate an exception during the credential refresh flow m.setattr( - session.bqclient._credentials, + session.bqclient._http.credentials, "refresh", mock.Mock(side_effect=google.auth.exceptions.RefreshError()), ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 99526a65d2..710e1481be 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -13,12 +13,14 @@ # limitations under the License. 
import datetime as dt +import json import math import re import tempfile import db_dtypes # type: ignore import geopandas as gpd # type: ignore +import google.api_core.exceptions import numpy from packaging.version import Version import pandas as pd @@ -2130,7 +2132,7 @@ def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_na ], ) def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): - bf_uniq = scalars_df_index[col_name].unique().to_numpy() + bf_uniq = scalars_df_index[col_name].unique().to_numpy(na_value=None) pd_uniq = scalars_pandas_df_index[col_name].unique() numpy.array_equal(pd_uniq, bf_uniq) @@ -2999,6 +3001,17 @@ def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): assert_series_equal(bf_result, pd_result, ignore_order=not ordered) +def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_too"] + bf_result = col_bf.clip(-100, 3.14151593).to_pandas() + + col_pd = scalars_pandas_df_index["int64_too"] + # pandas doesn't work with Int64 and clip with floats + pd_result = col_pd.astype("int64").clip(-100, 3.14151593).astype("Float64") + + assert_series_equal(bf_result, pd_result) + + def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 @@ -3474,9 +3487,11 @@ def foo(x): ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), ("int64_col", "time64[us][pyarrow]"), + ("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), ("bool_col", "Float64"), + ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("string_col", "binary[pyarrow]"), ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and @@ -3541,7 +3556,7 @@ def test_astype_safe(session): pd.testing.assert_series_equal(result, exepcted) -def test_series_astype_error_error(session): +def test_series_astype_w_invalid_error(session): input = pd.Series(["hello", "world", "3.11", "4000"]) with pytest.raises(ValueError): session.read_pandas(input).astype("Float64", errors="bad_value") @@ -3676,6 +3691,119 @@ def test_timestamp_astype_string(): assert bf_result.dtype == "string[pyarrow]" +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_float_astype_json(errors): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_string_astype_json(errors): + data = [ + "1", + None, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + ] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_string_astype_json_in_safe_mode(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + 
bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +def test_string_astype_json_raise_error(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="syntax error while parsing value", + ): + bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), + pytest.param( + ['"str"', None], + dtypes.TIME_DTYPE, + id="invalid", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_astype_others(data, to_type, errors): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + + bf_result = bf_series.astype(to_type, errors=errors) + assert bf_result.dtype == to_type + + load_data = [json.loads(item) if item is not None else None for item in data] + expected = pd.Series(load_data, dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_raise_error(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + with pytest.raises(google.api_core.exceptions.BadRequest): + bf_series.astype(to_type, errors="raise").to_pandas() + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_in_safe_mode(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + bf_result = bf_series.astype(to_type, errors="null") + assert bf_result.dtype == to_type + + expected = pd.Series([None, None], dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + @pytest.mark.parametrize( "index", [0, 5, -2], @@ -3687,9 +3815,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): assert bf_result == pd_result -def test_iloc_single_integer_out_of_bound_error( - scalars_df_index, scalars_pandas_df_index -): +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): scalars_df_index.string_col.iloc[99] @@ -4478,4 +4604,5 @@ def test_series_to_pandas_dry_run(scalars_df_index): result = bf_series.to_pandas(dry_run=True) - assert len(result) == 14 + assert isinstance(result, pd.Series) + assert len(result) > 
0 diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py index 5390d65268..426679d37d 100644 --- a/tests/system/small/test_series_io.py +++ b/tests/system/small/test_series_io.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy +import numpy.testing import pandas as pd import pytest @@ -114,3 +116,12 @@ def test_to_pandas_batches(scalars_dfs, page_size, max_results, allow_large_resu total_rows += actual_rows assert total_rows == expected_total_rows + + +def test_to_numpy(scalars_dfs): + bf_df, pd_df = scalars_dfs + + bf_result = numpy.array(bf_df["int64_too"], dtype="int64") + pd_result = numpy.array(pd_df["int64_too"], dtype="int64") + + numpy.testing.assert_array_equal(bf_result, pd_result) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index eeb242e8da..6e68a759b4 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1369,6 +1369,45 @@ def test_read_csv_for_names_and_index_col( ) +def test_read_csv_for_dtype(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + dtype = {"bool_col": pd.BooleanDtype(), "int64_col": pd.Float64Dtype()} + bf_df = session.read_csv(path, engine="bigquery", dtype=dtype) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv(path, dtype=dtype) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index("rowindex").sort_index() + pd_df = pd_df.set_index("rowindex") + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_dtype_w_names(session, df_and_gcs_csv_for_two_columns): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + dtype = {"b": pd.BooleanDtype(), "c": pd.Float64Dtype()} + bf_df = session.read_csv(path, engine="bigquery", names=names, dtype=dtype) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv(path, names=names, dtype=dtype) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. 
+ bf_df = bf_df.set_index("a").sort_index() + pd_df = pd_df.set_index("a") + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + @pytest.mark.parametrize( ("kwargs", "match"), [ @@ -1775,16 +1814,6 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) -def test_read_gbq_test(test_session: bigframes.Session): - test_project_id = "bigframes-dev" - test_dataset_id = "test_env_only" - test_table_id = "one_table" - table_id = f"{test_project_id}.{test_dataset_id}.{test_table_id}" - actual = test_session.read_gbq(table_id).to_pandas() - - assert actual.shape == (1, 1) - - @pytest.mark.parametrize( ("query_or_table", "index_col", "columns"), [ @@ -1899,9 +1928,11 @@ def _assert_query_dry_run_stats_are_valid(result: pd.Series): "columnDtypes", "indexLevel", "indexDtypes", + "bigquerySchema", "projectId", "location", "jobType", + "dispatchedSql", "destinationTable", "useLegacySql", "referencedTables", @@ -1922,6 +1953,7 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): "isQuery", "columnCount", "columnDtypes", + "bigquerySchema", "numBytes", "numRows", "location", diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index b8f3a612d4..686499aa75 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -38,6 +38,7 @@ ("skip_bq_connection_check", False, True), ("client_endpoints_override", {}, {"bqclient": "endpoint_address"}), ("ordering_mode", "strict", "partial"), + ("requests_transport_adapters", object(), object()), ], ) def test_setter_raises_if_session_started(attribute, original_value, new_value): diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index 3430b6684f..f5182a380b 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -6,7 +6,6 @@ WITH `bfcte_0` AS ( FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0), STRUCT(1, -987654321, 1), STRUCT(2, 314159, 2), STRUCT(3, CAST(NULL AS INT64), 3), STRUCT(4, -234892, 4), STRUCT(5, 55555, 5), STRUCT(6, 101202303, 6), STRUCT(7, -214748367, 7), STRUCT(8, 2, 8)]) ) SELECT - `bfcol_3` AS `bfcol_5`, - `bfcol_4` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7` + `bfcol_3` AS `rowindex`, + `bfcol_4` AS `int64_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index f04f9ed023..d7e47b6032 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -10,7 +10,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), 123456789, 0, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 1.25, 0, 0, @@ -27,7 +27,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), -987654321, 1, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 2.51, 1, 1, @@ -44,7 +44,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), 314159, 0, - 101.101010100, + 
CAST(101.101010100 AS NUMERIC), 25000000000.0, 2, 2, @@ -95,7 +95,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), 55555, 0, - 5.555555000, + CAST(5.555555000 AS NUMERIC), 555.555, 5, 5, @@ -112,7 +112,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), 101202303, 2, - -10.090807000, + CAST(-10.090807000 AS NUMERIC), -123.456, 6, 6, @@ -129,7 +129,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), -214748367, 2, - 11111111.100000000, + CAST(11111111.100000000 AS NUMERIC), 42.42, 7, 7, @@ -157,20 +157,19 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_16`, - `bfcol_1` AS `bfcol_17`, - `bfcol_2` AS `bfcol_18`, - `bfcol_3` AS `bfcol_19`, - `bfcol_4` AS `bfcol_20`, - `bfcol_5` AS `bfcol_21`, - `bfcol_6` AS `bfcol_22`, - `bfcol_7` AS `bfcol_23`, - `bfcol_8` AS `bfcol_24`, - `bfcol_9` AS `bfcol_25`, - `bfcol_10` AS `bfcol_26`, - `bfcol_11` AS `bfcol_27`, - `bfcol_12` AS `bfcol_28`, - `bfcol_13` AS `bfcol_29`, - `bfcol_14` AS `bfcol_30`, - `bfcol_15` AS `bfcol_31` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `bool_col`, + `bfcol_2` AS `bytes_col`, + `bfcol_3` AS `date_col`, + `bfcol_4` AS `datetime_col`, + `bfcol_5` AS `geography_col`, + `bfcol_6` AS `int64_col`, + `bfcol_7` AS `int64_too`, + `bfcol_8` AS `numeric_col`, + `bfcol_9` AS `float64_col`, + `bfcol_10` AS `rowindex_1`, + `bfcol_11` AS `rowindex_2`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `time_col`, + `bfcol_14` AS `timestamp_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index c0e5a0a476..31b46e6c70 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -4,6 +4,5 @@ WITH `bfcte_0` AS ( FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) ) SELECT - `bfcol_0` AS `bfcol_2`, - `bfcol_1` AS `bfcol_3` + `bfcol_0` AS `json_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql index c97babdaef..1ba602f205 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -34,13 +34,12 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_9`, - `bfcol_1` AS `bfcol_10`, - `bfcol_2` AS `bfcol_11`, - `bfcol_3` AS `bfcol_12`, - `bfcol_4` AS `bfcol_13`, - `bfcol_5` AS `bfcol_14`, - `bfcol_6` AS `bfcol_15`, - `bfcol_7` AS `bfcol_16`, - `bfcol_8` AS `bfcol_17` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `bool_list_col`, + `bfcol_3` AS 
`float_list_col`, + `bfcol_4` AS `date_list_col`, + `bfcol_5` AS `date_time_list_col`, + `bfcol_6` AS `numeric_list_col`, + `bfcol_7` AS `string_list_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 509e63e029..54d1a1bb2b 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -20,7 +20,6 @@ WITH `bfcte_0` AS ( )]) ) SELECT - `bfcol_0` AS `bfcol_3`, - `bfcol_1` AS `bfcol_4`, - `bfcol_2` AS `bfcol_5` + `bfcol_0` AS `id`, + `bfcol_1` AS `person` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py index ab6402a909..9534c8605a 100644 --- a/tests/unit/core/test_expression.py +++ b/tests/unit/core/test_expression.py @@ -12,43 +12,107 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing + +import pytest + +from bigframes.core import field import bigframes.core.expression as ex import bigframes.core.identifiers as ids import bigframes.dtypes as dtypes import bigframes.operations as ops -def test_expression_dtype_simple(): +def test_simple_expression_dtype(): expression = ops.add_op.as_expr("a", "b") - result = expression.output_type( - {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + field_bindings = _create_field_bindings( + {"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE} ) - assert result == dtypes.INT_DTYPE + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.INT_DTYPE) -def test_expression_dtype_nested(): +def test_nested_expression_dtype(): expression = ops.add_op.as_expr( "a", ops.abs_op.as_expr(ops.sub_op.as_expr("b", ex.const(3.14))) ) - - result = expression.output_type( - {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + field_bindings = _create_field_bindings( + {"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE} ) - assert result == dtypes.FLOAT_DTYPE + result = ex.bind_schema_fields(expression, field_bindings) + _assert_output_type(result, dtypes.FLOAT_DTYPE) -def test_expression_dtype_where(): - expression = ops.where_op.as_expr(ex.const(3), ex.const(True), ex.const(None)) - result = expression.output_type({}) +def test_where_op_dtype(): + expression = ops.where_op.as_expr(ex.const(3), ex.const(True), ex.const(None)) - assert result == dtypes.INT_DTYPE + _assert_output_type(expression, dtypes.INT_DTYPE) -def test_expression_dtype_astype(): +def test_astype_op_dtype(): expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159)) - result = expression.output_type({}) + _assert_output_type(expression, dtypes.INT_DTYPE) + + +def test_deref_op_dtype_unavailable(): + expression = ex.deref("mycol") + + assert not expression.is_resolved + with pytest.raises(ValueError): + expression.output_type + + +def test_deref_op_dtype_resolution(): + expression = ex.deref("mycol") + field_bindings = _create_field_bindings({"mycol": dtypes.STRING_DTYPE}) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.STRING_DTYPE) + + +def test_field_ref_expr_dtype_resolution_short_circuit(): + expression 
= ex.SchemaFieldRefExpression( + field.Field(ids.ColumnId("mycol"), dtype=dtypes.INT_DTYPE) + ) + field_bindings = _create_field_bindings({"anotherCol": dtypes.STRING_DTYPE}) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.INT_DTYPE) + + +def test_nested_expression_dtypes_are_cached(): + expression = ops.add_op.as_expr(ex.deref("left_col"), ex.deref("right_col")) + field_bindings = _create_field_bindings( + { + "right_col": dtypes.INT_DTYPE, + "left_col": dtypes.FLOAT_DTYPE, + } + ) + + result = ex.bind_schema_fields(expression, field_bindings) + + _assert_output_type(result, dtypes.FLOAT_DTYPE) + assert isinstance(result, ex.OpExpression) + _assert_output_type(result.inputs[0], dtypes.FLOAT_DTYPE) + _assert_output_type(result.inputs[1], dtypes.INT_DTYPE) + + +def _create_field_bindings( + col_dtypes: typing.Dict[str, dtypes.Dtype] +) -> typing.Dict[ids.ColumnId, field.Field]: + return { + ids.ColumnId(col): field.Field(ids.ColumnId(col), dtype) + for col, dtype in col_dtypes.items() + } + - assert result == dtypes.INT_DTYPE +def _assert_output_type(expr: ex.Expression, dtype: dtypes.Dtype): + assert expr.is_resolved + assert expr.output_type == dtype diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 811c64a27b..eba015dd9d 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -45,6 +45,10 @@ def method2(self): def method3(self): pass + @log_adapter.log_name_override("override_name") + def method4(self): + pass + @property def my_field(self): return 0 @@ -52,55 +56,57 @@ def my_field(self): return TestClass() -def test_method_logging(test_instance): +@pytest.fixture +def test_method(): + @log_adapter.method_logger + def method1(): + pass + + return method1 + + +@pytest.fixture +def test_method_w_custom_base(): + def method1(): + pass + + _decorated_method = log_adapter.method_logger(method1, custom_base_name="pandas") + + return _decorated_method + + +def test_class_attribute_logging(test_instance): test_instance.method1() test_instance.method2() + test_instance.method4() # Check if the methods were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() assert "testclass-method1" in api_methods assert "testclass-method2" in api_methods assert "testclass-method3" not in api_methods + assert "testclass-method4" not in api_methods + assert "testclass-override_name" in api_methods -def test_property_logging(test_instance): - test_instance.my_field - - # Check if the properties were added to the _api_methods list +def test_method_logging(test_method): + test_method() api_methods = log_adapter.get_and_reset_api_methods() - assert "testclass-my_field" in api_methods + assert "locals-method1" in api_methods -def test_method_logging__include_internal_calls(): - @log_adapter.class_logger(include_internal_calls=True) - class TestClass: - def public_method(self): - self._internal_method() - - def _internal_method(self): - pass - - TestClass().public_method() - +def test_method_logging_with_custom_base_name(test_method_w_custom_base): + test_method_w_custom_base() api_methods = log_adapter.get_and_reset_api_methods() - assert "testclass-public_method" in api_methods - assert "testclass-_internal_method" in api_methods + assert "pandas-method1" in api_methods -def test_method_logging__exclude_internal_calls(): - @log_adapter.class_logger(include_internal_calls=False) - class TestClass: - def public_method(self): - self._internal_method() - 
- def _internal_method(self): - pass - - TestClass().public_method() +def test_property_logging(test_instance): + test_instance.my_field + # Check if the properties were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() - assert "testclass-public_method" in api_methods - assert "testclass-_internal_method" not in api_methods + assert "testclass-my_field" in api_methods def test_add_api_method_limit(test_instance): @@ -176,6 +182,20 @@ def test_get_and_reset_api_methods(test_instance): "args_count": 0, }, ), + ( + "pandas", + "concat", + [[None, None]], + {"axis": 1}, + log_adapter.PANDAS_API_TRACKING_TASK, + { + "task": log_adapter.PANDAS_API_TRACKING_TASK, + "class_name": "pandas", + "method_name": "concat", + "args_count": 1, + "kwargs_0": "axis", + }, + ), ), ) def test_submit_pandas_labels( diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 62cfe09704..10fefcc457 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -81,6 +81,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + type(mock_X).sql = mock.PropertyMock(return_value="input_X_sql_property") mock_X.reset_index(drop=True).cache().sql = "input_X_no_index_sql" mock_X.join(mock_y).sql = "input_X_y_sql" mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) @@ -248,7 +249,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): ) -def test_decomposition_mf_score(mock_session, bqml_model, mock_X): +def test_decomposition_mf_score(mock_session, bqml_model): model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", @@ -258,8 +259,23 @@ def test_decomposition_mf_score(mock_session, bqml_model, mock_X): l2_reg=9.83, ) model._bqml_model = bqml_model - model.score(mock_X) - + model.score() mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) + + +def test_decomposition_mf_score_with_x(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.score(mock_X) + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql_property))" + ) diff --git a/tests/unit/session/test_clients.py b/tests/unit/session/test_clients.py index c9a12be584..5304c99466 100644 --- a/tests/unit/session/test_clients.py +++ b/tests/unit/session/test_clients.py @@ -13,25 +13,24 @@ # limitations under the License. 
import os -from typing import Optional +import pathlib +import tempfile +from typing import cast, Optional import unittest.mock as mock -import google.api_core.client_info -import google.api_core.client_options -import google.api_core.exceptions -import google.api_core.gapic_v1.client_info import google.auth.credentials import google.cloud.bigquery import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 +import requests.adapters import bigframes.session.clients as clients import bigframes.version -def create_clients_provider(application_name: Optional[str] = None): +def create_clients_provider(application_name: Optional[str] = None, **kwargs): credentials = mock.create_autospec(google.auth.credentials.Credentials) return clients.ClientsProvider( project="test-project", @@ -40,12 +39,15 @@ def create_clients_provider(application_name: Optional[str] = None): credentials=credentials, application_name=application_name, bq_kms_key_name="projects/my-project/locations/us/keyRings/myKeyRing/cryptoKeys/myKey", + **kwargs, ) def monkeypatch_client_constructors(monkeypatch): bqclient = mock.create_autospec(google.cloud.bigquery.Client) bqclient.return_value = bqclient + # Assume we have a new client library in the unit tests. + bqclient.default_job_creation_mode = None # type: ignore monkeypatch.setattr(google.cloud.bigquery, "Client", bqclient) bqconnectionclient = mock.create_autospec( @@ -83,6 +85,11 @@ def monkeypatch_client_constructors(monkeypatch): ) +def assert_bqclient_sets_default_job_creation_mode(provider: clients.ClientsProvider): + bqclient = provider.bqclient + assert bqclient.default_job_creation_mode == "JOB_CREATION_OPTIONAL" + + def assert_constructed_w_user_agent(mock_client: mock.Mock, expected_user_agent: str): assert ( expected_user_agent @@ -127,6 +134,24 @@ def assert_clients_wo_user_agent( ) +def test_requests_transport_adapters_pool_maxsize(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + requests_transport_adapters = ( + ("http://", requests.adapters.HTTPAdapter(pool_maxsize=123)), + ("https://", requests.adapters.HTTPAdapter(pool_maxsize=123)), + ) # doctest: +SKIP + provider = create_clients_provider( + requests_transport_adapters=requests_transport_adapters + ) + + _, kwargs = cast(mock.Mock, provider.bqclient).call_args + requests_session = kwargs.get("_http") + adapter: requests.adapters.HTTPAdapter = requests_session.get_adapter( + "https://bigquery.googleapis.com/" + ) + assert adapter._pool_maxsize == 123 # type: ignore + + def test_user_agent_default(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider(application_name=None) @@ -148,6 +173,7 @@ def test_user_agent_not_in_vscode(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_wo_user_agent(provider, "vscode") + assert_clients_wo_user_agent(provider, "googlecloudtools.cloudcode") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") @@ -158,16 +184,48 @@ def test_user_agent_in_vscode(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_w_user_agent(provider, "vscode") + assert_clients_wo_user_agent(provider, "googlecloudtools.cloudcode") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, 
f"bigframes/{bigframes.version.__version__}") +@mock.patch.dict(os.environ, {"VSCODE_PID": "12345"}, clear=True) +def test_user_agent_in_vscode_w_extension(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + + with tempfile.TemporaryDirectory() as tmpdir: + user_home = pathlib.Path(tmpdir) + extension_dir = ( + user_home / ".vscode" / "extensions" / "googlecloudtools.cloudcode-0.12" + ) + extension_config = extension_dir / "package.json" + + # originally extension config does not exist + assert not extension_config.exists() + + # simulate extension installation by creating extension config on disk + extension_dir.mkdir(parents=True) + with open(extension_config, "w") as f: + f.write("{}") + + with mock.patch("pathlib.Path.home", return_value=user_home): + provider = create_clients_provider() + assert_clients_w_user_agent(provider, "vscode") + assert_clients_w_user_agent(provider, "googlecloudtools.cloudcode") + + # We still need to include attribution to bigframes + assert_clients_w_user_agent( + provider, f"bigframes/{bigframes.version.__version__}" + ) + + @mock.patch.dict(os.environ, {}, clear=True) def test_user_agent_not_in_jupyter(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_wo_user_agent(provider, "jupyter") + assert_clients_wo_user_agent(provider, "bigquery_jupyter_plugin") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") @@ -178,6 +236,37 @@ def test_user_agent_in_jupyter(monkeypatch): monkeypatch_client_constructors(monkeypatch) provider = create_clients_provider() assert_clients_w_user_agent(provider, "jupyter") + assert_clients_wo_user_agent(provider, "bigquery_jupyter_plugin") # We still need to include attribution to bigframes assert_clients_w_user_agent(provider, f"bigframes/{bigframes.version.__version__}") + + +@mock.patch.dict(os.environ, {"JPY_PARENT_PID": "12345"}, clear=True) +def test_user_agent_in_jupyter_with_plugin(monkeypatch): + monkeypatch_client_constructors(monkeypatch) + + def custom_import_module_side_effect(name, package=None): + if name == "bigquery_jupyter_plugin": + return mock.MagicMock() + else: + import importlib + + return importlib.import_module(name, package) + + assert isinstance( + custom_import_module_side_effect("bigquery_jupyter_plugin"), mock.MagicMock + ) + assert custom_import_module_side_effect("bigframes") is bigframes + + with mock.patch( + "importlib.import_module", side_effect=custom_import_module_side_effect + ): + provider = create_clients_provider() + assert_clients_w_user_agent(provider, "jupyter") + assert_clients_w_user_agent(provider, "bigquery_jupyter_plugin") + + # We still need to include attribution to bigframes + assert_clients_w_user_agent( + provider, f"bigframes/{bigframes.version.__version__}" + ) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 14e5d1c2fe..e5e2c58d59 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -231,7 +231,6 @@ def test_start_query_with_client_labels_length_limit_met( sql, job_config, timeout=timeout, - api_name=api_name, ) assert job_config.labels is not None diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index a27abd5f6c..cffc6b3af7 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -14,6 +14,10 @@ """Unit tests for read_gbq_colab 
helper functions.""" +import textwrap + +from google.cloud import bigquery + from bigframes.testing import mocks @@ -29,32 +33,42 @@ def test_read_gbq_colab_includes_label(): continue label_values.extend(config.labels.values()) - assert "read_gbq_colab" in label_values + assert "session-read_gbq_colab" in label_values -def test_read_gbq_colab_includes_formatted_values_in_dry_run(): +def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): session = mocks.create_bigquery_session() + bf_df = mocks.create_dataframe(monkeypatch, session=session) + bf_df._to_view = lambda: bigquery.TableReference.from_string("my-project.my_dataset.some_view") # type: ignore pyformat_args = { "some_integer": 123, "some_string": "This could be dangerous, but we escape it", + "bf_df": bf_df, # This is not a supported type, but ignored if not referenced. "some_object": object(), } + _ = session._read_gbq_colab( - """ - SELECT {some_integer} as some_integer, - {some_string} as some_string, - '{{escaped}}' as escaped - """, + textwrap.dedent( + """ + SELECT {some_integer} as some_integer, + {some_string} as some_string, + '{{escaped}}' as escaped + FROM {bf_df} + """ + ), pyformat_args=pyformat_args, dry_run=True, ) - expected = """ + expected = textwrap.dedent( + """ SELECT 123 as some_integer, 'This could be dangerous, but we escape it' as some_string, '{escaped}' as escaped + FROM `my-project`.`my_dataset`.`some_view` """ + ) queries = session._queries # type: ignore configs = session._job_configs # type: ignore diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index a56b4ed7ab..6a4ae7cb60 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -91,6 +91,6 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte ) table._properties["location"] = session._location - result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols, "") + result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) assert result == expected diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 91b6679702..cbd31f588a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -108,11 +108,6 @@ @pytest.mark.parametrize( ("kwargs", "match"), [ - pytest.param( - {"engine": "bigquery", "dtype": {}}, - "BigQuery engine does not support the `dtype` argument", - id="with_dtype", - ), pytest.param( {"engine": "bigquery", "usecols": [1, 2]}, "BigQuery engine only supports an iterable of strings for `usecols`.", @@ -215,6 +210,17 @@ def test_read_csv_w_bigquery_engine_raises_error_for_invalid_names( session.read_csv("path/to/csv.csv", engine="bigquery", names=names) +def test_read_csv_w_bigquery_engine_raises_error_for_invalid_dtypes(): + session = mocks.create_bigquery_session() + + with pytest.raises(ValueError, match="dtype should be a dict-like object."): + session.read_csv( + "path/to/csv.csv", + engine="bigquery", + dtype=["a", "b", "c"], # type: ignore[arg-type] + ) + + @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) def test_read_gbq_missing_parts(missing_parts_table_id): session = mocks.create_bigquery_session() @@ -484,7 +490,7 @@ def today(cls): mocks.create_bigquery_session() -@mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1) +@mock.patch("bigframes.constants.MAX_INLINE_BYTES", 1) def test_read_pandas_inline_exceeds_limit_raises_error(): session = 
mocks.create_bigquery_session() pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 9d67fd33b7..d630380e7a 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -90,6 +90,45 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( assert destination.startswith(anonymous_dataset_id) +def test_dataframe_rename_columns(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [], "col2": [], "col3": []} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + renamed = dataframe.rename(columns={"col1": "a", "col2": "b", "col3": "c"}) + assert renamed.columns.to_list() == ["a", "b", "c"] + + +def test_dataframe_rename_columns_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"col1": [], "col2": [], "col3": []} + ) + assert dataframe.columns.to_list() == ["col1", "col2", "col3"] + assert ( + dataframe.rename(columns={"col1": "a", "col2": "b", "col3": "c"}, inplace=True) + is None + ) + assert dataframe.columns.to_list() == ["a", "b", "c"] + + +def test_dataframe_rename_axis(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"]) + assert list(dataframe.index.names) == ["index1", "index2"] + renamed = dataframe.rename_axis(["a", "b"]) + assert list(renamed.index.names) == ["a", "b"] + + +def test_dataframe_rename_axis_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"]) + assert list(dataframe.index.names) == ["index1", "index2"] + assert dataframe.rename_axis(["a", "b"], inplace=True) is None + assert list(dataframe.index.names) == ["a", "b"] + + def test_dataframe_semantics_property_future_warning( monkeypatch: pytest.MonkeyPatch, ): diff --git a/tests/unit/test_dataframe_io.py b/tests/unit/test_dataframe_io.py index 7845a71134..f2c0241396 100644 --- a/tests/unit/test_dataframe_io.py +++ b/tests/unit/test_dataframe_io.py @@ -49,3 +49,8 @@ def test_dataframe_to_pandas(mock_df, api_name, kwargs): mock_df.to_pandas.assert_called_once_with( allow_large_results=kwargs["allow_large_results"] ) + + +def test_to_gbq_if_exists_invalid(mock_df): + with pytest.raises(ValueError, match="Got invalid value 'invalid' for if_exists."): + mock_df.to_gbq("a.b.c", if_exists="invalid") diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py new file mode 100644 index 0000000000..97f1e4419e --- /dev/null +++ b/tests/unit/test_index.py @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes.testing import mocks + + +def test_index_rename(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"idx": [], "col": []} + ).set_index("idx") + index = dataframe.index + assert index.name == "idx" + renamed = index.rename("my_index_name") + assert renamed.name == "my_index_name" + + +def test_index_rename_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + dataframe = mocks.create_dataframe( + monkeypatch, data={"idx": [], "col": []} + ).set_index("idx") + index = dataframe.index + assert index.name == "idx" + assert index.rename("my_index_name", inplace=True) is None + + # Make sure the linked DataFrame is updated, too. + assert dataframe.index.name == "my_index_name" + assert index.name == "my_index_name" diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py index bb7330aba4..71479e89d4 100644 --- a/tests/unit/test_local_data.py +++ b/tests/unit/test_local_data.py @@ -64,3 +64,16 @@ def test_local_data_well_formed_round_trip_sliced(): result.reset_index(drop=True), check_dtype=False, ) + + +def test_local_data_equal_self(): + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + assert local_entry == local_entry + assert hash(local_entry) == hash(local_entry) + + +def test_local_data_not_equal_other(): + local_entry = local_data.ManagedArrowTable.from_pandas(pd_data) + local_entry2 = local_data.ManagedArrowTable.from_pandas(pd_data[::2]) + assert local_entry != local_entry2 + assert hash(local_entry) != hash(local_entry2) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index b4672d07a9..509bc6ade2 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -27,7 +27,7 @@ # All tests in this file require polars to be installed to pass. @pytest.fixture(scope="module") def polars_session(): - from . 
import polars_session + from bigframes.testing import polars_session return polars_session.TestSession() @@ -79,6 +79,33 @@ def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_ses pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_polars_local_engine_series_rename_with_mapping(polars_session): + pd_series = pd.Series( + ["a", "b", "c"], index=[1, 2, 3], dtype="string[pyarrow]", name="test_name" + ) + bf_series = bpd.Series(pd_series, session=polars_session) + + bf_result = bf_series.rename({1: 100, 2: 200, 3: 300}).to_pandas() + pd_result = pd_series.rename({1: 100, 2: 200, 3: 300}) + # pd default index is int64, bf is Int64 + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session): + pd_series = pd.Series( + ["a", "b", "c"], index=[1, 2, 3], dtype="string[pyarrow]", name="test_name" + ) + bf_series = bpd.Series(pd_series, session=polars_session) + + pd_series.rename({1: 100, 2: 200, 3: 300}, inplace=True) + assert bf_series.rename({1: 100, 2: 200, 3: 300}, inplace=True) is None + + bf_result = bf_series.to_pandas() + pd_result = pd_series + # pd default index is int64, bf is Int64 + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + def test_polars_local_engine_reset_index( small_inline_frame: pd.DataFrame, polars_session ): diff --git a/tests/unit/test_series.py b/tests/unit/test_series.py index 1409209c6c..8a083d7e4a 100644 --- a/tests/unit/test_series.py +++ b/tests/unit/test_series.py @@ -12,7 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import cast + +import pytest + import bigframes.series +from bigframes.testing import mocks + + +def test_series_rename(monkeypatch: pytest.MonkeyPatch): + series = cast(bigframes.series.Series, mocks.create_dataframe(monkeypatch)["col"]) + assert series.name == "col" + renamed = series.rename("renamed_col") + assert renamed.name == "renamed_col" + + +def test_series_rename_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + series = cast(bigframes.series.Series, mocks.create_dataframe(monkeypatch)["col"]) + assert series.name == "col" + assert series.rename("renamed_col", inplace=True) is None + assert series.name == "renamed_col" + + +def test_series_rename_axis(monkeypatch: pytest.MonkeyPatch): + series = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"])["col1"] + assert list(series.index.names) == ["index1", "index2"] + renamed = series.rename_axis(["a", "b"]) + assert list(renamed.index.names) == ["a", "b"] + + +def test_series_rename_axis_inplace_returns_none(monkeypatch: pytest.MonkeyPatch): + series = mocks.create_dataframe( + monkeypatch, data={"index1": [], "index2": [], "col1": [], "col2": []} + ).set_index(["index1", "index2"])["col1"] + assert list(series.index.names) == ["index1", "index2"] + assert series.rename_axis(["a", "b"], inplace=True) is None + assert list(series.index.names) == ["a", "b"] def test_series_repr_with_uninitialized_object(): diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index af87694cd5..6d55817a27 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -52,5 +52,6 @@ "bigquery_load", "bigquery_streaming", "bigquery_write", + "_deferred", ] VALID_WRITE_ENGINES 
= typing.get_args(WriteEngineType) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py index 5b4e4d85a1..fba0339ae9 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py @@ -53,6 +53,8 @@ def from_ibis(cls, dtype: dt.DataType) -> str: ) elif dtype.is_integer(): return "INT64" + elif dtype.is_boolean(): + return "BOOLEAN" elif dtype.is_binary(): return "BYTES" elif dtype.is_string(): diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index 6e98d6a9e1..acccd7ea6c 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -1222,6 +1222,12 @@ def __sql_name__(self, op: ops.ScalarUDF | ops.AggUDF) -> str: # not actually a table, but easier to quote individual namespace # components this way namespace = op.__udf_namespace__ + + # Function names prefixed with "SAFE.", such as `SAFE.PARSE_JSON`, + # are typically not quoted. + if funcname.startswith("SAFE."): + return funcname + return sg.table(funcname, db=namespace.database, catalog=namespace.catalog).sql( self.dialect ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5bbf72b421..c1b5b5a86b 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,12 +11,13 @@ """ from __future__ import annotations -from typing import Hashable, Iterable, Literal, Mapping, Optional, Sequence, Union +from typing import Hashable, Iterable, Literal, Optional, Sequence, Union from bigframes_vendored import constants import bigframes_vendored.pandas.core.generic as generic import numpy as np import pandas as pd +from pandas.api import extensions as pd_ext # ----------------------------------------------------------------------- # DataFrame class @@ -369,7 +370,7 @@ def to_numpy( self, dtype=None, copy=False, - na_value=None, + na_value=pd_ext.no_default, *, allow_large_results=None, **kwargs, @@ -1391,8 +1392,9 @@ def align( def rename( self, *, - columns: Mapping, - ) -> DataFrame: + columns, + inplace, + ): """Rename columns. Dict values must be unique (1-to-1). Labels not contained in a dict @@ -1425,16 +1427,20 @@ def rename( Args: columns (Mapping): Dict-like from old column labels to new column labels. + inplace (bool): + Default False. Whether to modify the DataFrame rather than + creating a new one. Returns: - bigframes.pandas.DataFrame: DataFrame with the renamed axis labels. + bigframes.pandas.DataFrame | None: + DataFrame with the renamed axis labels or None if ``inplace=True``. Raises: KeyError: If any of the labels is not found. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: + def rename_axis(self, mapper, *, inplace, **kwargs): """ Set the name of the axis for the index. @@ -1442,11 +1448,15 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Currently only accepts a single string parameter (the new name of the index). Args: - mapper str: + mapper (str): Value to set the axis name attribute. + inplace (bool): + Default False. Modifies the object directly, instead of + creating a new Series or DataFrame. 
Returns: - bigframes.pandas.DataFrame: DataFrame with the new index name + bigframes.pandas.DataFrame | None: + DataFrame with the new index name or None if ``inplace=True``. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4788,6 +4798,83 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def round(self, decimals): + """ + Round a DataFrame to a variable number of decimal places. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats']) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + [4 rows x 2 columns] + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + [4 rows x 2 columns] + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + [4 rows x 2 columns] + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + [4 rows x 2 columns] + + Args: + decimals (int, dict, Series): + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. + + Returns: + bigframes.pandas.DataFrame: + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def apply(self, func, *, axis=0, args=(), **kwargs): """Apply a function along an axis of the DataFrame. diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index c94f707671..7df1c7a9de 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,6 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py from __future__ import annotations +from collections.abc import Hashable import typing from bigframes import constants @@ -940,7 +941,7 @@ def fillna(self, value) -> Index: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename(self, name) -> Index: + def rename(self, name, *, inplace): """ Alter Index or MultiIndex name. @@ -959,10 +960,13 @@ def rename(self, name) -> Index: Args: name (label or list of labels): Name(s) to set. + inplace (bool): + Default False. Modifies the object directly, instead of + creating a new Index or MultiIndex. Returns: - bigframes.pandas.Index: - The same type as the caller. + bigframes.pandas.Index | None: + The same type as the caller or None if ``inplace=True``. 
Raises: ValueError: @@ -1061,6 +1065,28 @@ def drop_duplicates(self, *, keep: str = "first"): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unique(self, level: Hashable | int | None = None): + """ + Returns unique values in the index. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 1, 2, 3, 3]) + >>> idx.unique() + Index([1, 2, 3], dtype='Int64') + + Args: + level (int or hashable, optional): + Only return values from specified level (for MultiIndex). + If int, gets the level by integer position, else by level name. + + Returns: + bigframes.pandas.Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8164fa7415..61cd6a47bf 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -19,8 +19,8 @@ from bigframes_vendored.pandas.core.generic import NDFrame import numpy import numpy as np -from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer +from pandas.api import extensions as pd_ext from bigframes import constants @@ -323,7 +323,7 @@ def reset_index( self, *, drop: bool = False, - name=lib.no_default, + name=pd_ext.no_default, ) -> DataFrame | Series | None: """ Generate a new DataFrame or Series with the index reset. @@ -730,7 +730,9 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: to_list = tolist - def to_numpy(self, dtype, copy=False, na_value=None, *, allow_large_results=None): + def to_numpy( + self, dtype, copy=False, na_value=pd_ext.no_default, *, allow_large_results=None + ): """ A NumPy ndarray representing the values in this Series or Index. @@ -5255,7 +5257,7 @@ def argmin(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename(self, index, **kwargs) -> Series | None: + def rename(self, index, *, inplace, **kwargs): """ Alter Series index labels or name. @@ -5299,15 +5301,17 @@ def rename(self, index, **kwargs) -> Series | None: the index. Scalar or hashable sequence-like will alter the ``Series.name`` attribute. + inplace (bool): + Default False. Whether to return a new Series. Returns: - bigframes.pandas.Series: - Series with index labels. + bigframes.pandas.Series | None: + Series with index labels or None if ``inplace=True``. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rename_axis(self, mapper, **kwargs): + def rename_axis(self, mapper, *, inplace, **kwargs): """ Set the name of the axis for the index or columns. diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index fb29cc8984..c3c3a77b71 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -73,11 +73,13 @@ def score(self, X=None, y=None): for the outputs relevant to this model type. Args: - X (default None): - Ignored. + X (bigframes.dataframe.DataFrame | bigframes.series.Series | None): + DataFrame of shape (n_samples, n_features). Test samples. 
+ + y (bigframes.dataframe.DataFrame | bigframes.series.Series | None): + DataFrame of shape (n_samples,) or (n_samples, n_outputs). True + labels for `X`. - y (default None): - Ignored. Returns: bigframes.dataframe.DataFrame: DataFrame that represents model metrics. """ diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 3a34d3d7bb..6cc3d952ed 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.4.0" +__version__ = "2.5.0" # {x-release-please-start-date} -__release_date__ = "2025-05-12" +__release_date__ = "2025-05-30" # {x-release-please-end}
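
A minimal usage sketch of the `inplace=True` rename support that the unit tests and vendored docstring changes above describe. This sketch is not part of the diff itself; it assumes bigframes 2.5.0 with a configured BigQuery project for the default session, and the column names and sample data below are illustrative only.

import bigframes.pandas as bpd

df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# DataFrame.rename with inplace=True mutates the frame and returns None.
assert df.rename(columns={"col1": "a", "col2": "b"}, inplace=True) is None
assert df.columns.to_list() == ["a", "b"]

# Series.rename behaves the same way: the Series object is renamed in place.
s = df["a"]
assert s.rename("renamed_col", inplace=True) is None
assert s.name == "renamed_col"

# Index.rename(inplace=True) also updates the DataFrame that owns the index.
df2 = df.set_index("a")
idx = df2.index
assert idx.rename("my_index_name", inplace=True) is None
assert df2.index.name == "my_index_name"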