diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 3983728ba1..51b21a62b7 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:5581906b957284864632cde4e9c51d1cc66b0094990b27e689132fe5cd036046 -# created: 2025-01-16T15:24:11.364245182Z + digest: sha256:a7aef70df5f13313ddc027409fc8f3151422ec2a57ac8730fce8fa75c060d5bb +# created: 2025-04-10T17:00:10.042601326Z diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index 37874a6888..80bfd5f951 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -20,6 +20,23 @@ branchProtectionRules: - 'cover' - 'Kokoro presubmit' - 'Kokoro windows' +- pattern: v1 + requiresCodeOwnerReviews: true + requiresStrictStatusChecks: false + requiredStatusCheckContexts: + - 'OwlBot Post Processor' + - 'conventionalcommits.org' + - 'cla/google' + - 'docs' + - 'lint' + - 'mypy' + - 'unit (3.9)' + - 'unit (3.10)' + - 'unit (3.11)' + - 'unit (3.12)' + - 'cover' + - 'Kokoro presubmit' + - 'Kokoro windows' permissionRules: - team: actools-python permission: admin diff --git a/.gitignore b/.gitignore index d083ea1ddc..f7c77e4d3e 100644 --- a/.gitignore +++ b/.gitignore @@ -60,5 +60,6 @@ coverage.xml system_tests/local_test_setup # Make sure a generated file isn't accidentally committed. +demo.ipynb pylintrc pylintrc.test diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 7da0881bbe..124e4b8b48 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -57,8 +57,7 @@ git config --global --add safe.directory "${PROJECT_ROOT}" # Workaround for older pip not able to resolve dependencies. See internal # issue 316909553. -python3.10 -m pip install pip==23.3.2 -python3.10 -m pip install --require-hashes -r .kokoro/requirements.txt +python3.10 -m pip install pip==25.0.1 # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ca120bd07..863a345da1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,11 +31,11 @@ repos: hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 6.1.0 + rev: 7.1.2 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.15.0 hooks: - id: mypy additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126] diff --git a/CHANGELOG.md b/CHANGELOG.md index bebe139c72..667273167b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,86 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.0.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.42.0...v2.0.0) (2025-04-17) + + +### ⚠ BREAKING CHANGES + +* make `dataset` and `name` params mandatory in `udf` ([#1619](https://github.com/googleapis/python-bigquery-dataframes/issues/1619)) +* Locational endpoints support is not available in BigFrames 2.0. 
+* change default LLM model to gemini-2.0-flash-001, drop PaLM2TextGenerator and PaLM2TextEmbeddingGenerator ([#1558](https://github.com/googleapis/python-bigquery-dataframes/issues/1558)) +* change default ingress setting for `remote_function` to internal-only ([#1544](https://github.com/googleapis/python-bigquery-dataframes/issues/1544)) +* make `remote_function` params keyword only ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) +* make `remote_function` default service account explicit ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) +* set `allow_large_results=False` by default ([#1541](https://github.com/googleapis/python-bigquery-dataframes/issues/1541)) + +### Features + +* Add `on` parameter in `dataframe.rolling()` and `dataframe.groupby.rolling()` ([#1556](https://github.com/googleapis/python-bigquery-dataframes/issues/1556)) ([45c9d9f](https://github.com/googleapis/python-bigquery-dataframes/commit/45c9d9fd1c5c13a8692435aa22861820fc11e347)) +* Add component to manage temporary tables ([#1559](https://github.com/googleapis/python-bigquery-dataframes/issues/1559)) ([0a4e245](https://github.com/googleapis/python-bigquery-dataframes/commit/0a4e245670e678f4ead0aec8f8b534e7fe97d112)) +* Add Series.to_pandas_batches() method ([#1592](https://github.com/googleapis/python-bigquery-dataframes/issues/1592)) ([09ce979](https://github.com/googleapis/python-bigquery-dataframes/commit/09ce97999cfc1ded72906b1c7307da5950978ae6)) +* Add support for creating a Matrix Factorization model ([#1330](https://github.com/googleapis/python-bigquery-dataframes/issues/1330)) ([b5297f9](https://github.com/googleapis/python-bigquery-dataframes/commit/b5297f909b08928b97d887764d6e5142c763a5a3)) +* Allow `input_types`, `output_type`, and `dataset` to be used positionally in `remote_function` ([#1560](https://github.com/googleapis/python-bigquery-dataframes/issues/1560)) ([bcac8c6](https://github.com/googleapis/python-bigquery-dataframes/commit/bcac8c6ed0b40902d0ccaef3f907e6acbe6a52ed)) +* Allow pandas.cut 'labels' parameter to accept a list of string ([#1549](https://github.com/googleapis/python-bigquery-dataframes/issues/1549)) ([af842b1](https://github.com/googleapis/python-bigquery-dataframes/commit/af842b174de7eef4908b397d6a745caf8eda7b3d)) +* Change default ingress setting for `remote_function` to internal-only ([#1544](https://github.com/googleapis/python-bigquery-dataframes/issues/1544)) ([c848a80](https://github.com/googleapis/python-bigquery-dataframes/commit/c848a80766ff68ea92c05a5dc5c26508e6755381)) +* Detect duplicate column/index names in read_gbq before send query. 
([#1615](https://github.com/googleapis/python-bigquery-dataframes/issues/1615)) ([40d6960](https://github.com/googleapis/python-bigquery-dataframes/commit/40d696088114fb08e68df74be261144350b785c8)) +* Drop support for locational endpoints ([#1542](https://github.com/googleapis/python-bigquery-dataframes/issues/1542)) ([4bf2e43](https://github.com/googleapis/python-bigquery-dataframes/commit/4bf2e43ef4498b11f32086231fc4cc749fde966a)) +* Enable time range rolling for DataFrame, DataFrameGroupBy and SeriesGroupBy ([#1605](https://github.com/googleapis/python-bigquery-dataframes/issues/1605)) ([b4b7073](https://github.com/googleapis/python-bigquery-dataframes/commit/b4b7073da8348b6597bd3d90d1a758cd29586533)) +* Improve local data validation ([#1598](https://github.com/googleapis/python-bigquery-dataframes/issues/1598)) ([815e471](https://github.com/googleapis/python-bigquery-dataframes/commit/815e471b904d4bd708afc4bfbf1db945e76f75c9)) +* Make `remote_function` default service account explicit ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) ([9eb9089](https://github.com/googleapis/python-bigquery-dataframes/commit/9eb9089ce3f1dad39761ba8ebc2d6f76261bd243)) +* Set `allow_large_results=False` by default ([#1541](https://github.com/googleapis/python-bigquery-dataframes/issues/1541)) ([e9fb712](https://github.com/googleapis/python-bigquery-dataframes/commit/e9fb7129a05e8ac7c938ffe30e86902950316f20)) +* Support bigquery connection in managed function ([#1554](https://github.com/googleapis/python-bigquery-dataframes/issues/1554)) ([f6f697a](https://github.com/googleapis/python-bigquery-dataframes/commit/f6f697afc167e0fa7ea923c0aed85a9ef257d61f)) +* Support bq connection path format ([#1550](https://github.com/googleapis/python-bigquery-dataframes/issues/1550)) ([e7eb918](https://github.com/googleapis/python-bigquery-dataframes/commit/e7eb918dd9df3569febe695f57c1a5909844fd3c)) +* Support gemini-2.0-X models ([#1558](https://github.com/googleapis/python-bigquery-dataframes/issues/1558)) ([3104fab](https://github.com/googleapis/python-bigquery-dataframes/commit/3104fab019d20b0cbc06cd81d43b3f34fd1dd987)) +* Support inlining small list, struct, json data ([#1589](https://github.com/googleapis/python-bigquery-dataframes/issues/1589)) ([2ce891f](https://github.com/googleapis/python-bigquery-dataframes/commit/2ce891fcd5bfd9f093fbcbb1ea35158d2bf9d8b9)) +* Support time range rolling on Series. 
([#1590](https://github.com/googleapis/python-bigquery-dataframes/issues/1590)) ([6e98a2c](https://github.com/googleapis/python-bigquery-dataframes/commit/6e98a2cf53dd130963a9c5ba07e21ce6c32b7c6d)) +* Use session temp tables for all ephemeral storage ([#1569](https://github.com/googleapis/python-bigquery-dataframes/issues/1569)) ([9711b83](https://github.com/googleapis/python-bigquery-dataframes/commit/9711b830a7bdc6740f4ebeaaab6f37082ae5dfd9)) +* Use validated local storage for data uploads ([#1612](https://github.com/googleapis/python-bigquery-dataframes/issues/1612)) ([aee4159](https://github.com/googleapis/python-bigquery-dataframes/commit/aee4159807401d7432bb8c0c41859ada3291599b)) +* Warn the deprecated `max_download_size`, `random_state` and `sampling_method` parameters in `(DataFrame|Series).to_pandas()` ([#1573](https://github.com/googleapis/python-bigquery-dataframes/issues/1573)) ([b9623da](https://github.com/googleapis/python-bigquery-dataframes/commit/b9623daa847805abf420f0f11e173674fb147193)) + + +### Bug Fixes + +* `to_pandas_batches()` respects `page_size` and `max_results` again ([#1572](https://github.com/googleapis/python-bigquery-dataframes/issues/1572)) ([27c5905](https://github.com/googleapis/python-bigquery-dataframes/commit/27c59051549b83fdac954eaa3d257803c6f9133d)) +* Ensure `page_size` works correctly in `to_pandas_batches` when `max_results` is not set ([#1588](https://github.com/googleapis/python-bigquery-dataframes/issues/1588)) ([570cff3](https://github.com/googleapis/python-bigquery-dataframes/commit/570cff3c2efe3a47535bb3c931a345856d256a19)) +* Include role and service account in IAM exception ([#1564](https://github.com/googleapis/python-bigquery-dataframes/issues/1564)) ([8c50755](https://github.com/googleapis/python-bigquery-dataframes/commit/8c507556c5f61fab95c6389a8ad04d731df1df7b)) +* Make `dataset` and `name` params mandatory in `udf` ([#1619](https://github.com/googleapis/python-bigquery-dataframes/issues/1619)) ([637e860](https://github.com/googleapis/python-bigquery-dataframes/commit/637e860d3cea0a36b1e58a45ec9b9ab0059fb3b1)) +* Pandas.cut returns labels index for numeric breaks when labels=False ([#1548](https://github.com/googleapis/python-bigquery-dataframes/issues/1548)) ([b2375de](https://github.com/googleapis/python-bigquery-dataframes/commit/b2375decedbf1a793eedbbc9dc2efc2296f8cc6e)) +* Prevent `KeyError` in `bpd.concat` with empty DF and struct/array types DF ([#1568](https://github.com/googleapis/python-bigquery-dataframes/issues/1568)) ([b4da1cf](https://github.com/googleapis/python-bigquery-dataframes/commit/b4da1cf3c0fb94a2bb21e6039896accab85742d4)) +* Read_csv supports for tilde local paths and includes index for bigquery_stream write engine ([#1580](https://github.com/googleapis/python-bigquery-dataframes/issues/1580)) ([352e8e4](https://github.com/googleapis/python-bigquery-dataframes/commit/352e8e4b05cf19e970b47b017f958a1c6fc89bea)) +* Use dictionaries to avoid problematic google.iam namespace ([#1611](https://github.com/googleapis/python-bigquery-dataframes/issues/1611)) ([b03e44f](https://github.com/googleapis/python-bigquery-dataframes/commit/b03e44f7fca429a6de41c42ec28504b688cd84f0)) + + +### Performance Improvements + +* Directly read gbq table for simple plans ([#1607](https://github.com/googleapis/python-bigquery-dataframes/issues/1607)) ([6ad38e8](https://github.com/googleapis/python-bigquery-dataframes/commit/6ad38e8287354f62b0c5cad1f3d5b897256860ca)) + + +### Dependencies + +* Remove jellyfish dependency 
([#1604](https://github.com/googleapis/python-bigquery-dataframes/issues/1604)) ([1ac0e1e](https://github.com/googleapis/python-bigquery-dataframes/commit/1ac0e1e82c097717338a6816f27c01b67736f51c)) +* Remove parsy dependency ([#1610](https://github.com/googleapis/python-bigquery-dataframes/issues/1610)) ([293f676](https://github.com/googleapis/python-bigquery-dataframes/commit/293f676e98446c417c12c345d5db875dd4c438df)) +* Remove test dependency on pytest-mock package ([#1622](https://github.com/googleapis/python-bigquery-dataframes/issues/1622)) ([1ba72ea](https://github.com/googleapis/python-bigquery-dataframes/commit/1ba72ead256178afee6f1d3303b0556bec1c4a9b)) +* Support a shapely versions 1.8.5+ ([#1621](https://github.com/googleapis/python-bigquery-dataframes/issues/1621)) ([e39ee3b](https://github.com/googleapis/python-bigquery-dataframes/commit/e39ee3bcf37f2a4f5e6ce981d248c24c6f5d770b)) + + +### Documentation + +* Add details for `bigquery_connection` in `[@bpd](https://github.com/bpd).udf` docstring ([#1609](https://github.com/googleapis/python-bigquery-dataframes/issues/1609)) ([ef63772](https://github.com/googleapis/python-bigquery-dataframes/commit/ef6377277bc9c354385c83ceba9e00094c0a6cc6)) +* Add explain forecast snippet to multiple time series tutorial ([#1586](https://github.com/googleapis/python-bigquery-dataframes/issues/1586)) ([40c55a0](https://github.com/googleapis/python-bigquery-dataframes/commit/40c55a06a529ca49d203227ccf36c12427d0cd5b)) +* Add message to remove default model for version 3.0 ([#1563](https://github.com/googleapis/python-bigquery-dataframes/issues/1563)) ([910be2b](https://github.com/googleapis/python-bigquery-dataframes/commit/910be2b5b2bfaf0e21cdc4fd775c1605a864c1aa)) +* Add samples for ArimaPlus `time_series_id_col` feature ([#1577](https://github.com/googleapis/python-bigquery-dataframes/issues/1577)) ([1e4cd9c](https://github.com/googleapis/python-bigquery-dataframes/commit/1e4cd9cf69f98d4af6b2a70bd8189c619b19baaa)) +* Add warning for bigframes 2.0 ([#1557](https://github.com/googleapis/python-bigquery-dataframes/issues/1557)) ([3f0eaa1](https://github.com/googleapis/python-bigquery-dataframes/commit/3f0eaa1c6b02d086270421f91dbb6aa2f117317d)) +* Deprecate default model in `TextEmbedddingGenerator`, `GeminiTextGenerator`, and other `bigframes.ml.llm` classes ([#1570](https://github.com/googleapis/python-bigquery-dataframes/issues/1570)) ([89ab33e](https://github.com/googleapis/python-bigquery-dataframes/commit/89ab33e1179aef142415fd5c9073671903bf1d45)) +* Include all licenses for vendored packages in the root LICENSE file ([#1626](https://github.com/googleapis/python-bigquery-dataframes/issues/1626)) ([8116ed0](https://github.com/googleapis/python-bigquery-dataframes/commit/8116ed0938634d301a153613f8a9cd8053ddf026)) +* Remove gemini-1.5 deprecation warning for `GeminiTextGenerator` ([#1562](https://github.com/googleapis/python-bigquery-dataframes/issues/1562)) ([0cc6784](https://github.com/googleapis/python-bigquery-dataframes/commit/0cc678448fdec1eaa3acfbb563a018325a8c85bc)) +* Use restructured text to allow publishing to PyPI ([#1565](https://github.com/googleapis/python-bigquery-dataframes/issues/1565)) ([d1e9ec2](https://github.com/googleapis/python-bigquery-dataframes/commit/d1e9ec2936d270ec4035014ea3ddd335a5747ade)) + + +### Miscellaneous Chores + +* Make `remote_function` params keyword only ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) 
([9eb9089](https://github.com/googleapis/python-bigquery-dataframes/commit/9eb9089ce3f1dad39761ba8ebc2d6f76261bd243)) + ## [1.42.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.41.0...v1.42.0) (2025-03-27) diff --git a/LICENSE b/LICENSE index d645695673..c7807337dc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,6 @@ +Files: All files not covered by another license. Notably: the bigframes module, +tests/*, bigframes_vendored.google_cloud_bigquery module, +bigframes_vendored.ibis module, and bigframes_vendored.xgboost module. Apache License Version 2.0, January 2004 @@ -200,3 +203,118 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +--- + +Files: For the bigframes_vendored.cpython module. + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 + +1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"), and the Individual or Organization ("Licensee") accessing and otherwise using this software ("Python") in source or binary form and its associated documentation. +2. Subject to the terms and conditions of this License Agreement, PSF hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use Python alone or in any derivative version, provided, however, that PSF's License Agreement and PSF's notice of copyright , i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation All Rights Reserved" are retained in Python alone or in any derivative version prepared by Licensee. +3. In the event Licensee prepares a derivative work that is based on or incorporates Python or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to Python. +4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. +7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between PSF and Licensee. This License Agreement does not grant permission to use PSF trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. +8. By copying, installing or otherwise using Python, Licensee agrees to be bound by the terms and conditions of this License Agreement. + +--- + +Files: for the bigframes_vendored.geopandas module. + +Copyright (c) 2013-2022, GeoPandas developers. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of GeoPandas nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +Files: The bigframes_vendored.pandas module. + +BSD 3-Clause License + +Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2011-2023, Open source contributors. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +Files: The bigframes_vendored.sklearn module. + +BSD 3-Clause License + +Copyright (c) 2007-2023 The scikit-learn developers. +All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.rst b/README.rst
index 185c50c14a..7f487b9077 100644
--- a/README.rst
+++ b/README.rst
@@ -12,6 +12,31 @@ powered by the BigQuery engine.
 BigQuery DataFrames is an open-source package. You can run
 ``pip install --upgrade bigframes`` to install the latest version.
 
+⚠️ Warning: Breaking Changes in BigQuery DataFrames v2.0
+--------------------------------------------------------
+
+Version 2.0 introduces breaking changes for improved security and performance. Key default behaviors have changed, including:
+
+* **Large Results (>10GB):** The default value for ``allow_large_results`` has changed to ``False``.
+  Methods like ``to_pandas()`` will now fail if the query result's compressed data size exceeds 10GB,
+  unless large results are explicitly permitted.
+* **Remote Function Security:** The library no longer uses the Compute Engine default service account
+  as the identity of the Cloud Run functions it creates; to keep that behavior, pass
+  ``cloud_function_service_account="default"`` explicitly. Network ingress now defaults to ``"internal-only"``.
+* **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset``
+  to ``remote_function`` must now be passed by keyword; positional arguments are no longer supported.
+* **@udf Argument Passing:** The ``dataset`` and ``name`` arguments to ``udf`` are now mandatory.
+* **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed.
+* **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model.
+  PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default
+  model will be removed in Version 3.0.
+
+**Important:** If you are not ready to adapt to these changes, pin your dependency to a version below 2.0
+(for example, ``bigframes==1.42.0``) to avoid disruption.
+
+To learn about these changes and how to migrate to version 2.0, see the
+`updated introduction guide `_.
+
 .. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg
    :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability
 .. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg
diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py
index 84bc4f6d01..5155b09063 100644
--- a/bigframes/_config/bigquery_options.py
+++ b/bigframes/_config/bigquery_options.py
@@ -21,7 +21,6 @@
 
 import google.api_core.exceptions
 import google.auth.credentials
-import jellyfish
 
 import bigframes.constants
 import bigframes.enums
@@ -37,6 +36,7 @@
 
 
 def _get_validated_location(value: Optional[str]) -> Optional[str]:
+    import bigframes._tools.strings
 
     if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS:
         return value
@@ -53,7 +53,7 @@ def _get_validated_location(value: Optional[str]) -> Optional[str]:
 
     possibility = min(
         bigframes.constants.ALL_BIGQUERY_LOCATIONS,
-        key=lambda item: jellyfish.levenshtein_distance(location, item),
+        key=lambda item: bigframes._tools.strings.levenshtein_distance(location, item),
     )
     # There are many layers before we get to (possibly) the user's code:
     # -> bpd.options.bigquery.location = "us-central-1"
@@ -89,7 +89,7 @@ def __init__(
         kms_key_name: Optional[str] = None,
         skip_bq_connection_check: bool = False,
         *,
-        allow_large_results: bool = True,
+        allow_large_results: bool = False,
         ordering_mode: Literal["strict", "partial"] = "strict",
         client_endpoints_override: Optional[dict] = None,
     ):
@@ -258,7 +258,8 @@ def allow_large_results(self, value: bool):
 
     @property
     def use_regional_endpoints(self) -> bool:
-        """Flag to connect to regional API endpoints.
+        """Flag to connect to regional API endpoints for BigQuery API and
+        BigQuery Storage API.
 
         .. note::
            Use of regional endpoints is a feature in Preview and available only
@@ -267,18 +268,16 @@ def use_regional_endpoints(self) -> bool:
            "us-east5", "us-east7", "us-south1", "us-west1", "us-west2",
            "us-west3" and "us-west4".
 
-        .. deprecated:: 0.13.0
-            Use of locational endpoints is available only in selected projects.
-
-        Requires that ``location`` is set. For supported regions, for example
-        ``europe-west3``, you need to specify ``location='europe-west3'`` and
-        ``use_regional_endpoints=True``, and then BigQuery DataFrames would
-        connect to the BigQuery endpoint ``bigquery.europe-west3.rep.googleapis.com``.
-        For not supported regions, for example ``asia-northeast1``, when you
-        specify ``location='asia-northeast1'`` and ``use_regional_endpoints=True``,
-        a different endpoint (called locational endpoint, now deprecated, used
-        to provide weaker promise on the request remaining within the location
-        during transit) ``europe-west3-bigquery.googleapis.com`` would be used.
+        Requires that ``location`` is set. For [supported regions](https://cloud.google.com/bigquery/docs/regional-endpoints),
+        for example ``europe-west3``, you need to specify
+        ``location='europe-west3'`` and ``use_regional_endpoints=True``, and
+        then BigQuery DataFrames would connect to the BigQuery endpoint
+        ``bigquery.europe-west3.rep.googleapis.com``. For unsupported regions,
+        for example ``asia-northeast1``, when you specify
+        ``location='asia-northeast1'`` and ``use_regional_endpoints=True``,
+        the global endpoint ``bigquery.googleapis.com`` would be used, which
+        provides no guarantee that the request remains within the location
+        during transit.
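+
+        For example, to opt in to regional endpoints in a supported region
+        (illustrative; assumes your project has access to the Preview)::
+
+            import bigframes.pandas as bpd
+
+            bpd.options.bigquery.location = "europe-west3"
+            bpd.options.bigquery.use_regional_endpoints = True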
 
         Returns:
             bool:
diff --git a/bigframes/_tools/__init__.py b/bigframes/_tools/__init__.py
new file mode 100644
index 0000000000..ea3bc209d0
--- /dev/null
+++ b/bigframes/_tools/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""_tools is a collection of helper functions with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
diff --git a/bigframes/_tools/strings.py b/bigframes/_tools/strings.py
new file mode 100644
index 0000000000..3d9402c68f
--- /dev/null
+++ b/bigframes/_tools/strings.py
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper methods for processing strings with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
+
+import numpy
+
+
+def levenshtein_distance(left: str, right: str) -> int:
+    """Compute the edit distance between two strings.
+
+    This is the minimum number of substitutions, insertions, and deletions
+    needed to transform the left string into the right string. See:
+    https://en.wikipedia.org/wiki/Levenshtein_distance
+    """
+    # TODO(tswast): accelerate with numba (if available) if we end up using this
+    # function in contexts other than when raising an exception or there are too
+    # many values to compare even in that context.
+
+    distances0 = numpy.zeros(len(right) + 1)
+    distances1 = numpy.zeros(len(right) + 1)
+
+    # Maximum distance is to drop all characters and then add the other string.
+    distances0[:] = range(len(right) + 1)
+
+    for left_index in range(len(left)):
+        # Calculate distance from distances0 to distances1.
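+        # Invariant: distances0[j] holds the edit distance between
+        # left[:left_index] and right[:j]; this pass fills distances1 with the
+        # distances for left[:left_index + 1].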
+
+        # Edit distance is to delete (left_index + 1) chars from left to match empty right
+        distances1[0] = left_index + 1
+        # Compare left[left_index] against each character of right.
+        for right_index in range(len(right)):
+            left_char = left[left_index]
+            right_char = right[right_index]
+
+            deletion_cost = distances0[right_index + 1] + 1
+            insertion_cost = distances1[right_index] + 1
+            if left_char == right_char:
+                substitution_cost = distances0[right_index]
+            else:
+                substitution_cost = distances0[right_index] + 1
+
+            distances1[right_index + 1] = min(
+                deletion_cost, insertion_cost, substitution_cost
+            )
+
+        temp = distances0
+        distances0 = distances1
+        distances1 = temp
+
+    # Cast to int: the working arrays are float64, but the distance is integral.
+    return int(distances0[len(right)])
diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index 6b9fa308d8..f7f035bff4 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -69,7 +69,7 @@ def _output_bq_type(self):
     def _create_udf(self):
         """Create Python UDF in BQ. Return name of the UDF."""
         udf_name = str(
-            self._session._loader._storage_manager.generate_unique_resource_id()
+            self._session._anon_dataset_manager.generate_unique_resource_id()
         )
 
         func_body = inspect.getsource(self._func)
@@ -102,6 +102,9 @@ def _create_udf(self):
     def udf(self):
         """Create and return the UDF object."""
         udf_name = self._create_udf()
+
+        # TODO(b/404605969): remove cleanups when UDF fixes dataset deletion.
+        self._session._function_session._update_temp_artifacts(udf_name, "")
         return self._session.read_gbq_function(udf_name)
diff --git a/bigframes/clients.py b/bigframes/clients.py
index 1b8212377d..f1f6d686fd 100644
--- a/bigframes/clients.py
+++ b/bigframes/clients.py
@@ -17,32 +17,57 @@
 from __future__ import annotations
 
 import logging
+import textwrap
 import time
 from typing import cast, Optional
 
 import google.api_core.exceptions
 import google.api_core.retry
 from google.cloud import bigquery_connection_v1, resourcemanager_v3
-from google.iam.v1 import iam_policy_pb2, policy_pb2
 
 logger = logging.getLogger(__name__)
 
 
-def resolve_full_bq_connection_name(
-    connection_name: str, default_project: str, default_location: str
+def get_canonical_bq_connection_id(
+    connection_id: str, default_project: str, default_location: str
 ) -> str:
-    """Retrieve the full connection name of the form <project>.<location>.<connection_id>.
-    Use default project, location or connection_id when any of them are missing."""
-    if connection_name.count(".") == 2:
-        return connection_name
-
-    if connection_name.count(".") == 1:
-        return f"{default_project}.{connection_name}"
-
-    if connection_name.count(".") == 0:
-        return f"{default_project}.{default_location}.{connection_name}"
-
-    raise ValueError(f"Invalid connection name format: {connection_name}.")
+    """
+    Retrieve the full connection id of the form
+    <project>.<location>.<connection_id>.
+    Use default project, location or connection_id when any of them are missing.
+    """
+
+    if "/" in connection_id:
+        fields = connection_id.split("/")
+        if (
+            len(fields) == 6
+            and fields[0] == "projects"
+            and fields[2] == "locations"
+            and fields[4] == "connections"
+        ):
+            return ".".join((fields[1], fields[3], fields[5]))
+    else:
+        if connection_id.count(".") == 2:
+            return connection_id
+
+        if connection_id.count(".") == 1:
+            return f"{default_project}.{connection_id}"
+
+        if connection_id.count(".") == 0:
+            return f"{default_project}.{default_location}.{connection_id}"
+
+    raise ValueError(
+        textwrap.dedent(
+            f"""
+            Invalid connection id format: {connection_id}.
+            Only the following formats are supported:
+                <project-id>.<location>.<connection-id>,
+                <location>.<connection-id>,
+                <connection-id>,
+                projects/<project-id>/locations/<location>/connections/<connection-id>
+            """
+        ).strip()
+    )
 
 
 class BqConnectionManager:
@@ -60,7 +85,11 @@ def __init__(
         self._cloud_resource_manager_client = cloud_resource_manager_client
 
     def create_bq_connection(
-        self, project_id: str, location: str, connection_id: str, iam_role: str
+        self,
+        project_id: str,
+        location: str,
+        connection_id: str,
+        iam_role: Optional[str] = None,
     ):
         """Create the BQ connection if not exist. In addition, try to add the
         IAM role to the connection to ensure required permissions.
@@ -80,7 +109,7 @@ def create_bq_connection(
             )
             if service_account_id:
                 logger.info(
-                    f"Connector {project_id}.{location}.{connection_id} already exists"
+                    f"BQ connection {project_id}.{location}.{connection_id} already exists"
                 )
             else:
                 connection_name, service_account_id = self._create_bq_connection(
@@ -90,9 +119,15 @@ def create_bq_connection(
                 )
                 logger.info(
                     f"Created BQ connection {connection_name} with service account id: {service_account_id}"
                 )
             service_account_id = cast(str, service_account_id)
+
         # Ensure IAM role on the BQ connection
         # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function
-        self._ensure_iam_binding(project_id, service_account_id, iam_role)
+        if iam_role:
+            try:
+                self._ensure_iam_binding(project_id, service_account_id, iam_role)
+            except google.api_core.exceptions.PermissionDenied as ex:
+                ex.message = f"Failed ensuring IAM binding (role={iam_role}, service-account={service_account_id}). {ex.message}"
+                raise
 
     # Introduce retries to accommodate transient errors like:
     # (1) Etag mismatch,
@@ -125,7 +160,9 @@ def _ensure_iam_binding(
         project = f"projects/{project_id}"
         service_account = f"serviceAccount:{service_account_id}"
         role = f"roles/{iam_role}"
-        request = iam_policy_pb2.GetIamPolicyRequest(resource=project)
+        request = {
+            "resource": project
+        }  # Use a dictionary to avoid problematic google.iam namespace package.
         policy = self._cloud_resource_manager_client.get_iam_policy(request=request)
 
         # Check if the binding already exists, and if does, do nothing more
@@ -135,9 +172,15 @@ def _ensure_iam_binding(
             return
 
         # Create a new binding
-        new_binding = policy_pb2.Binding(role=role, members=[service_account])
+        new_binding = {
+            "role": role,
+            "members": [service_account],
+        }  # Use a dictionary to avoid problematic google.iam namespace package.
         policy.bindings.append(new_binding)
-        request = iam_policy_pb2.SetIamPolicyRequest(resource=project, policy=policy)
+        request = {
+            "resource": project,
+            "policy": policy,
+        }  # Use a dictionary to avoid problematic google.iam namespace package.
         self._cloud_resource_manager_client.set_iam_policy(request=request)
 
         # We would wait for the IAM policy change to take effect
diff --git a/bigframes/constants.py b/bigframes/constants.py
index 8f5ed95e1a..89f27afd78 100644
--- a/bigframes/constants.py
+++ b/bigframes/constants.py
@@ -96,22 +96,27 @@
     }
 )
 
-# https://cloud.google.com/storage/docs/locational-endpoints
-LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset(
+REP_NOT_ENABLED_BIGQUERY_LOCATIONS = frozenset(
     ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS
 )
 
-LEP_DEPRECATION_WARNING_MESSAGE = textwrap.dedent(
+LOCATION_NEEDED_FOR_REP_MESSAGE = textwrap.dedent(
     """
-    Support for regional endpoints is not yet available in the location
-    {location} for BigQuery and BigQuery Storage APIs. For the supported
-    locations and APIs see https://cloud.google.com/bigquery/docs/regional-endpoints.
-    For other locations and APIs, currently an older, now deprecated locational
-    endpoints are being used, which requires your project to be allowlisted. In
-    future version 2.0 onwards the locational endpoints will no longer be
-    supported automatically when you enable regional endpoints. However, if you
-    still need them, you will be able to override the endpoints directly by
-    doing the following:
+    Must set location to use regional endpoints.
+    You can set it via bigframes.pandas.options.bigquery.location.
+    The supported locations can be found at
+    https://cloud.google.com/bigquery/docs/regional-endpoints#supported-locations.
+    """
+).strip()
+
+REP_NOT_SUPPORTED_MESSAGE = textwrap.dedent(
+    """
+    Support for regional endpoints for BigQuery and BigQuery Storage APIs may
+    not be available in the location {location}. For the supported APIs and
+    locations see https://cloud.google.com/bigquery/docs/regional-endpoints.
+    If you have the (deprecated) locational endpoints enabled in your project
+    (which requires your project to be allowlisted), you can override the
+    endpoints directly by doing the following:
     bigframes.pandas.options.bigquery.client_endpoints_override = {{
         "bqclient": "https://{location}-bigquery.googleapis.com",
         "bqconnectionclient": "{location}-bigqueryconnection.googleapis.com",
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py
index 7ede7b7e65..eba63ad72e 100644
--- a/bigframes/core/array_value.py
+++ b/bigframes/core/array_value.py
@@ -16,7 +16,6 @@
 from dataclasses import dataclass
 import datetime
 import functools
-import io
 import typing
 from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
 import warnings
@@ -24,7 +23,6 @@
 import google.cloud.bigquery
 import pandas
 import pyarrow as pa
-import pyarrow.feather as pa_feather
 
 import bigframes.core.expression as ex
 import bigframes.core.guid
@@ -60,24 +58,20 @@ class ArrayValue:
 
     @classmethod
     def from_pyarrow(cls, arrow_table: pa.Table, session: Session):
-        adapted_table = local_data.adapt_pa_table(arrow_table)
-        schema = local_data.arrow_schema_to_bigframes(adapted_table.schema)
+        data_source = local_data.ManagedArrowTable.from_pyarrow(arrow_table)
+        return cls.from_managed(source=data_source, session=session)
 
-        iobytes = io.BytesIO()
-        pa_feather.write_feather(adapted_table, iobytes)
-        # Scan all columns by default, we define this list as it can be pruned while preserving source_def
+    @classmethod
+    def from_managed(cls, source: local_data.ManagedArrowTable, session: Session):
         scan_list = nodes.ScanList(
             tuple(
                 nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
-                for item in schema.items
+                for item in source.schema.items
            )
        )
-
        node = nodes.ReadLocalNode(
-            iobytes.getvalue(),
-            data_schema=schema,
+            source,
             session=session,
-            n_rows=arrow_table.num_rows,
             scan_list=scan_list,
         )
         return cls(node)
@@ -103,6 +97,7 @@ def from_table(
         at_time: Optional[datetime.datetime] = None,
         primary_key: Sequence[str] = (),
         offsets_col: Optional[str] = None,
+        n_rows: Optional[int] = None,
     ):
         if offsets_col and primary_key:
             raise ValueError("must set at most one of 'offsets', 'primary_key'")
@@ -132,7 +127,11 @@ def from_table(
             )
         )
         source_def = nodes.BigqueryDataSource(
-            table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering
+            table=table_def,
+            at_time=at_time,
+            sql_predicate=predicate,
+            ordering=ordering,
+            n_rows=n_rows,
         )
         node = nodes.ReadTableNode(
             source=source_def,
@@ -182,7 +181,9 @@ def as_cached(
         Replace the node with an equivalent one that
references a table where the value has been materialized to. """ table = nodes.GbqTable.from_table(cache_table) - source = nodes.BigqueryDataSource(table, ordering=ordering) + source = nodes.BigqueryDataSource( + table, ordering=ordering, n_rows=cache_table.num_rows + ) # Assumption: GBQ cached table uses field name as bq column name scan_list = nodes.ScanList( tuple( @@ -412,7 +413,7 @@ def project_window_op( skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ # TODO: Support non-deterministic windowing - if window_spec.row_bounded or not op.order_independent: + if window_spec.is_row_bounded or not op.order_independent: if self.node.order_ambiguous and not self.session._strictly_ordered: if not self.session._allows_ambiguity: raise ValueError( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2992718412..c53f392417 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -53,6 +53,7 @@ from bigframes import session from bigframes._config import sampling_options import bigframes.constants +from bigframes.core import local_data import bigframes.core as core import bigframes.core.compile.googlesql as googlesql import bigframes.core.expression as ex @@ -187,8 +188,8 @@ def from_local( pd_data = pd_data.set_axis(column_ids, axis=1) pd_data = pd_data.reset_index(names=index_ids) - as_pyarrow = pa.Table.from_pandas(pd_data, preserve_index=False) - array_value = core.ArrayValue.from_pyarrow(as_pyarrow, session=session) + managed_data = local_data.ManagedArrowTable.from_pandas(pd_data) + array_value = core.ArrayValue.from_managed(managed_data, session=session) block = cls( array_value, column_labels=column_labels, @@ -590,6 +591,7 @@ def to_pandas_batches( page_size: Optional[int] = None, max_results: Optional[int] = None, allow_large_results: Optional[bool] = None, + squeeze: Optional[bool] = False, ): """Download results one message at a time. @@ -605,7 +607,10 @@ def to_pandas_batches( for record_batch in execute_result.arrow_batches(): df = io_pandas.arrow_to_pandas(record_batch, self.expr.schema) self._copy_index_to_pandas(df) - yield df + if squeeze: + yield df.squeeze(axis=1) + else: + yield df def _copy_index_to_pandas(self, df: pd.DataFrame): """Set the index on pandas DataFrame to match this block. 
@@ -987,7 +992,7 @@ def apply_nary_op( def multi_apply_window_op( self, columns: typing.Sequence[str], - op: agg_ops.WindowOp, + op: agg_ops.UnaryWindowOp, window_spec: windows.WindowSpec, *, skip_null_groups: bool = False, @@ -1058,7 +1063,7 @@ def project_exprs( def apply_window_op( self, column: str, - op: agg_ops.WindowOp, + op: agg_ops.UnaryWindowOp, window_spec: windows.WindowSpec, *, result_label: Label = None, @@ -2708,11 +2713,13 @@ def _get_rows_as_json_values(self) -> Block: ) ) + dest_table = self.session.bqclient.get_table(destination) expr = core.ArrayValue.from_table( - self.session.bqclient.get_table(destination), + dest_table, schema=new_schema, session=self.session, offsets_col=ordering_column_name, + n_rows=dest_table.num_rows, ).drop_columns([ordering_column_name]) block = Block( expr, diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index f96471e200..0d31798f25 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -360,69 +360,73 @@ def _( if isinstance(op.bins, int): col_min = _apply_window_if_present(x.min(), window) col_max = _apply_window_if_present(x.max(), window) + adj = (col_max - col_min) * 0.001 bin_width = (col_max - col_min) / op.bins - if op.labels is False: - for this_bin in range(op.bins - 1): - if op.right: - case_expr = x <= (col_min + (this_bin + 1) * bin_width) - else: - case_expr = x < (col_min + (this_bin + 1) * bin_width) - out = out.when( - case_expr, - compile_ibis_types.literal_to_ibis_scalar( - this_bin, force_dtype=pd.Int64Dtype() - ), + for this_bin in range(op.bins): + if op.labels is False: + value = compile_ibis_types.literal_to_ibis_scalar( + this_bin, + force_dtype=pd.Int64Dtype(), ) - out = out.when(x.notnull(), op.bins - 1) - else: - interval_struct = None - adj = (col_max - col_min) * 0.001 - for this_bin in range(op.bins): - left_edge_adj = adj if this_bin == 0 and op.right else 0 - right_edge_adj = adj if this_bin == op.bins - 1 and not op.right else 0 + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + force_dtype=pd.StringDtype(storage="pyarrow"), + ) + else: + left_adj = adj if this_bin == 0 and op.right else 0 + right_adj = adj if this_bin == op.bins - 1 and not op.right else 0 - left_edge = col_min + this_bin * bin_width - left_edge_adj - right_edge = col_min + (this_bin + 1) * bin_width + right_edge_adj + left = col_min + this_bin * bin_width - left_adj + right = col_min + (this_bin + 1) * bin_width + right_adj if op.right: - interval_struct = ibis_types.struct( - { - "left_exclusive": left_edge, - "right_inclusive": right_edge, - } + value = ibis_types.struct( + {"left_exclusive": left, "right_inclusive": right} ) else: - interval_struct = ibis_types.struct( - { - "left_inclusive": left_edge, - "right_exclusive": right_edge, - } + value = ibis_types.struct( + {"left_inclusive": left, "right_exclusive": right} ) - - if this_bin < op.bins - 1: - if op.right: - case_expr = x <= (col_min + (this_bin + 1) * bin_width) - else: - case_expr = x < (col_min + (this_bin + 1) * bin_width) - out = out.when(case_expr, interval_struct) + if this_bin == op.bins - 1: + case_expr = x.notnull() + else: + if op.right: + case_expr = x <= (col_min + (this_bin + 1) * bin_width) else: - out = out.when(x.notnull(), interval_struct) + case_expr = x < (col_min + (this_bin + 1) * bin_width) + out = out.when(case_expr, value) else: # Interpret as intervals - for 
interval in op.bins: + for this_bin, interval in enumerate(op.bins): left = compile_ibis_types.literal_to_ibis_scalar(interval[0]) right = compile_ibis_types.literal_to_ibis_scalar(interval[1]) if op.right: condition = (x > left) & (x <= right) - interval_struct = ibis_types.struct( - {"left_exclusive": left, "right_inclusive": right} - ) else: condition = (x >= left) & (x < right) - interval_struct = ibis_types.struct( - {"left_inclusive": left, "right_exclusive": right} + + if op.labels is False: + value = compile_ibis_types.literal_to_ibis_scalar( + this_bin, + force_dtype=pd.Int64Dtype(), ) - out = out.when(condition, interval_struct) + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + force_dtype=pd.StringDtype(storage="pyarrow"), + ) + else: + if op.right: + value = ibis_types.struct( + {"left_exclusive": left, "right_inclusive": right} + ) + else: + value = ibis_types.struct( + {"left_inclusive": left, "right_exclusive": right} + ) + + out = out.when(condition, value) return out.end() diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 4443c495d7..6202a34ce2 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -26,15 +26,16 @@ from bigframes_vendored.ibis.expr.operations import window as ibis_expr_window import bigframes_vendored.ibis.expr.operations as ibis_ops import bigframes_vendored.ibis.expr.types as ibis_types -import pandas +from google.cloud import bigquery +import pyarrow as pa +from bigframes.core import utils import bigframes.core.compile.aggregate_compiler as agg_compiler import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler import bigframes.core.expression as ex -import bigframes.core.guid from bigframes.core.ordering import OrderingExpression import bigframes.core.sql from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec @@ -231,7 +232,7 @@ def aggregate( col_out: agg_compiler.compile_aggregate( aggregate, bindings, - order_by=_convert_ordering_to_table_values(table, order_by), + order_by=_convert_row_ordering_to_table_values(table, order_by), ) for aggregate, col_out in aggregations } @@ -279,11 +280,8 @@ def _reproject_to_table(self) -> UnorderedIR: ) @classmethod - def from_pandas( - cls, - pd_df: pandas.DataFrame, - scan_cols: bigframes.core.nodes.ScanList, - offsets: typing.Optional[str] = None, + def from_polars( + cls, pa_table: pa.Table, schema: Sequence[bigquery.SchemaField] ) -> UnorderedIR: # TODO: add offsets """ @@ -292,37 +290,16 @@ def from_pandas( Assumed that the dataframe has unique string column names and bigframes-suppported dtypes. 
""" + import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes - # ibis memtable cannot handle NA, must convert to None - # this destroys the schema however - ibis_values = pd_df.astype("object").where(pandas.notnull(pd_df), None) # type: ignore - if offsets: - ibis_values = ibis_values.assign(**{offsets: range(len(pd_df))}) # derive the ibis schema from the original pandas schema - ibis_schema = [ - ( - local_label, - bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(dtype), - ) - for id, dtype, local_label in scan_cols.items - ] - if offsets: - ibis_schema.append((offsets, ibis_dtypes.int64)) - keys_memtable = bigframes_vendored.ibis.memtable( - ibis_values, schema=bigframes_vendored.ibis.schema(ibis_schema) + pa_table, + schema=third_party_ibis_bqtypes.BigQuerySchema.to_ibis(list(schema)), ) - - columns = [ - keys_memtable[local_label].name(col_id.sql) - for col_id, _, local_label in scan_cols.items - ] - if offsets: - columns.append(keys_memtable[offsets].name(offsets)) - return cls( keys_memtable, - columns=columns, + columns=tuple(keys_memtable[key] for key in keys_memtable.columns), ) def join( @@ -463,7 +440,7 @@ def project_window_op( never_skip_nulls=never_skip_nulls, ) - if expression.op.order_independent and not window_spec.row_bounded: + if expression.op.order_independent and window_spec.is_unbounded: # notably percentile_cont does not support ordering clause window_spec = window_spec.without_order() window = self._ibis_window_from_spec(window_spec) @@ -541,16 +518,30 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec): # 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed # 2. Order-independent op (aggregation, cut, rank) with range window - use ordering clause, ties allowed # 3. Order-depedenpent op (navigation functions, array_agg) or rows bounds - use total row order to break ties. - if window_spec.ordering: - order_by = _convert_ordering_to_table_values( + if window_spec.is_row_bounded: + if not window_spec.ordering: + # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. + raise ValueError("No ordering provided for ordered analytic function") + order_by = _convert_row_ordering_to_table_values( self._column_names, window_spec.ordering, ) - elif window_spec.row_bounded: - # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. - raise ValueError("No ordering provided for ordered analytic function") - else: + + elif window_spec.is_range_bounded: + order_by = [ + _convert_range_ordering_to_table_value( + self._column_names, + window_spec.ordering[0], + ) + ] + # The rest if branches are for unbounded windows + elif window_spec.ordering: # Unbound grouping window. Suitable for aggregations but not for analytic function application. 
+ order_by = _convert_row_ordering_to_table_values( + self._column_names, + window_spec.ordering, + ) + else: order_by = None window = bigframes_vendored.ibis.window(order_by=order_by, group_by=group_by) @@ -575,7 +566,7 @@ def is_window(column: ibis_types.Value) -> bool: return any(isinstance(op, ibis_ops.WindowFunction) for op in matches) -def _convert_ordering_to_table_values( +def _convert_row_ordering_to_table_values( value_lookup: typing.Mapping[str, ibis_types.Value], ordering_columns: typing.Sequence[OrderingExpression], ) -> typing.Sequence[ibis_types.Value]: @@ -603,6 +594,30 @@ def _convert_ordering_to_table_values( return ordering_values +def _convert_range_ordering_to_table_value( + value_lookup: typing.Mapping[str, ibis_types.Value], + ordering_column: OrderingExpression, +) -> ibis_types.Value: + """Converts the ordering for range windows to Ibis references. + + Note that this method is different from `_convert_row_ordering_to_table_values` in + that it does not arrange null values. There are two reasons: + 1. Manipulating null positions requires more than one ordering key, which is forbidden + by SQL window syntax for range rolling. + 2. Pandas does not allow range rolling on timeseries with nulls. + + Therefore, we opt for the simplest approach here: generate the simplest SQL and follow + the BigQuery engine behavior. + """ + expr = op_compiler.compile_expression( + ordering_column.scalar_expression, value_lookup + ) + + if ordering_column.direction.is_ascending: + return bigframes_vendored.ibis.asc(expr) # type: ignore + return bigframes_vendored.ibis.desc(expr) # type: ignore + + def _string_cast_join_cond( lvalue: ibis_types.Column, rvalue: ibis_types.Column ) -> ibis_types.BooleanColumn: @@ -692,8 +707,14 @@ def _add_boundary( ) -> ibis_expr_builders.LegacyWindowBuilder: if isinstance(bounds, RangeWindowBounds): return ibis_window.range( - start=_to_ibis_boundary(bounds.start), - end=_to_ibis_boundary(bounds.end), + start=_to_ibis_boundary( + None + if bounds.start is None + else utils.timedelta_to_micros(bounds.start) + ), + end=_to_ibis_boundary( + None if bounds.end is None else utils.timedelta_to_micros(bounds.end) + ), ) if isinstance(bounds, RowsWindowBounds): if bounds.start is not None or bounds.end is not None: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 3d9bf19f76..04d3ea1bf9 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -14,7 +14,6 @@ from __future__ import annotations import functools -import io import typing import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery @@ -22,16 +21,13 @@ import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types import google.cloud.bigquery -import pandas as pd +import pyarrow as pa from bigframes import dtypes, operations -from bigframes.core import utils import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.explode -import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as compile_scalar -import bigframes.core.compile.schema_translator import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites @@ -86,6 +82,7 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode): # TODO: Run all replacement rules as single bottom-up pass node = nodes.bottom_up(node, rewrites.rewrite_slice) node = 
nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) + node = nodes.bottom_up(node, rewrites.rewrite_range_rolling) return node @@ -161,19 +158,22 @@ def compile_fromrange( @_compile_node.register def compile_readlocal(node: nodes.ReadLocalNode, *args): - array_as_pd = pd.read_feather( - io.BytesIO(node.feather_bytes), - columns=[item.source_id for item in node.scan_list.items], - ) - - # Convert timedeltas to microseconds for compatibility with BigQuery - _ = utils.replace_timedeltas_with_micros(array_as_pd) - offsets = node.offsets_col.sql if node.offsets_col else None - return compiled.UnorderedIR.from_pandas( - array_as_pd, node.scan_list, offsets=offsets + pa_table = node.local_data_source.data + bq_schema = node.schema.to_bigquery() + + pa_table = pa_table.select(list(item.source_id for item in node.scan_list.items)) + pa_table = pa_table.rename_columns( + {item.source_id: item.id.sql for item in node.scan_list.items} ) + if offsets: + pa_table = pa_table.append_column( + offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) + ) + bq_schema = (*bq_schema, google.cloud.bigquery.SchemaField(offsets, "INT64")) + return compiled.UnorderedIR.from_polars(pa_table, bq_schema) + @_compile_node.register def compile_readtable(node: nodes.ReadTableNode, *args): diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 54b0a1408a..d5f9b5c5f9 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -388,7 +388,8 @@ def literal_to_ibis_scalar( # Ibis has bug for casting nulltype to geospatial, so we perform intermediate cast first geotype = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) return bigframes_vendored.ibis.literal(None, geotype) - ibis_dtype = BIGFRAMES_TO_IBIS[force_dtype] if force_dtype else None + + ibis_dtype = bigframes_dtype_to_ibis_dtype(force_dtype) if force_dtype else None if pd.api.types.is_list_like(literal): if validate: diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 6fac3c9b92..baa19eb990 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,7 +16,7 @@ import dataclasses import functools import itertools -from typing import cast, Optional, Sequence, Tuple, TYPE_CHECKING, Union +from typing import cast, Optional, Sequence, Tuple, TYPE_CHECKING import bigframes.core from bigframes.core import window_spec @@ -205,11 +205,10 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): cols_to_read = { scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items } - return ( - pl.read_ipc(node.feather_bytes, columns=list(cols_to_read.keys())) - .lazy() - .rename(cols_to_read) - ) + lazy_frame = cast( + pl.DataFrame, pl.from_arrow(node.local_data_source.data) + ).lazy() + return lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) @compile_node.register def compile_filter(self, node: nodes.FilterNode): @@ -360,6 +359,7 @@ def compile_window(self, node: nodes.WindowOpNode): return df.with_columns([agg_expr]) else: # row-bounded window + assert isinstance(window.bounds, window_spec.RowsWindowBounds) # Polars API semi-bounded, and any grouped rolling window challenging # https://github.com/pola-rs/polars/issues/4799 # https://github.com/pola-rs/polars/issues/8976 @@ -383,9 +383,7 @@ def compile_window(self, node: nodes.WindowOpNode): return pl.concat([df, results], how="horizontal") -def _get_period( - bounds: 
Union[window_spec.RowsWindowBounds, window_spec.RangeWindowBounds] -) -> Optional[int]: +def _get_period(bounds: window_spec.RowsWindowBounds) -> Optional[int]: """Returns None if the boundary is infinite.""" if bounds.start is None or bounds.end is None: return None diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 0296762447..eda70f5cf1 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -747,12 +747,12 @@ def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): @scalar_op_compiler.register_binary_op(ops.date_add_op) def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") + y.to_interval("us") # type: ignore + return x.cast(ibis_dtypes.timestamp()) + y.to_interval("us") # type: ignore @scalar_op_compiler.register_binary_op(ops.date_sub_op) def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") - y.to_interval("us") # type: ignore + return x.cast(ibis_dtypes.timestamp()) - y.to_interval("us") # type: ignore @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) diff --git a/bigframes/core/compile/sqlglot/__init__.py b/bigframes/core/compile/sqlglot/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/sqlglot/sqlglot_types.py b/bigframes/core/compile/sqlglot/sqlglot_types.py new file mode 100644 index 0000000000..06c78c1435 --- /dev/null +++ b/bigframes/core/compile/sqlglot/sqlglot_types.py @@ -0,0 +1,84 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
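
For orientation, the new compile_readlocal path above is plain pyarrow table surgery: select the scanned columns, rename their source ids to the SQL-facing ids, and optionally append an INT64 offsets column. A minimal, self-contained sketch of that pattern follows; the column names are illustrative stand-ins, not real scan-list ids, and dict-based rename_columns assumes a reasonably recent pyarrow (the same capability this diff itself relies on):

import pyarrow as pa

# Local data, standing in for node.local_data_source.data.
table = pa.table({"src_a": [10, 20], "src_b": ["x", "y"]})

# Keep only the scanned columns, then rename source ids to SQL ids.
table = table.select(["src_a", "src_b"])
table = table.rename_columns({"src_a": "col_a", "src_b": "col_b"})

# Optionally materialize row offsets as an INT64 column.
table = table.append_column(
    "offsets", pa.array(range(table.num_rows), type=pa.int64())
)
print(table.schema)
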
+ +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants +import numpy as np +import pandas as pd +import pyarrow as pa +import sqlglot as sg + +import bigframes.dtypes + + +class SQLGlotType: + @classmethod + def from_bigframes_dtype( + cls, + bigframes_dtype: typing.Union[ + bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any] + ], + ): + if bigframes_dtype == bigframes.dtypes.INT_DTYPE: + return "INT64" + elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE: + return "FLOAT64" + elif bigframes_dtype == bigframes.dtypes.STRING_DTYPE: + return "STRING" + elif bigframes_dtype == bigframes.dtypes.BOOL_DTYPE: + return "BOOLEAN" + elif bigframes_dtype == bigframes.dtypes.DATE_DTYPE: + return "DATE" + elif bigframes_dtype == bigframes.dtypes.TIME_DTYPE: + return "TIME" + elif bigframes_dtype == bigframes.dtypes.DATETIME_DTYPE: + return "DATETIME" + elif bigframes_dtype == bigframes.dtypes.TIMESTAMP_DTYPE: + return "TIMESTAMP" + elif bigframes_dtype == bigframes.dtypes.BYTES_DTYPE: + return "BYTES" + elif bigframes_dtype == bigframes.dtypes.NUMERIC_DTYPE: + return "NUMERIC" + elif bigframes_dtype == bigframes.dtypes.BIGNUMERIC_DTYPE: + return "BIGNUMERIC" + elif bigframes_dtype == bigframes.dtypes.JSON_DTYPE: + return "JSON" + elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE: + return "GEOGRAPHY" + elif isinstance(bigframes_dtype, pd.ArrowDtype): + if pa.types.is_list(bigframes_dtype.pyarrow_dtype): + inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + bigframes_dtype.pyarrow_dtype.value_type + ) + return ( + f"ARRAY<{SQLGlotType.from_bigframes_dtype(inner_bigframes_dtype)}>" + ) + elif pa.types.is_struct(bigframes_dtype.pyarrow_dtype): + struct_type = typing.cast(pa.StructType, bigframes_dtype.pyarrow_dtype) + inner_fields: list[str] = [] + for i in range(struct_type.num_fields): + field = struct_type.field(i) + key = sg.to_identifier(field.name).sql("bigquery") + dtype = SQLGlotType.from_bigframes_dtype( + bigframes.dtypes.arrow_dtype_to_bigframes_dtype(field.type) + ) + inner_fields.append(f"{key} {dtype}") + return "STRUCT<{}>".format(", ".join(inner_fields)) + + raise ValueError( + f"Unsupported type for {bigframes_dtype}. 
{constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index b97a5f4c48..f234bad126 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -14,12 +14,13 @@ from __future__ import annotations +import datetime import typing from typing import Literal, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby -import jellyfish +import numpy import pandas as pd from bigframes import session @@ -31,6 +32,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.validations as validations +from bigframes.core.window import rolling import bigframes.core.window as windows import bigframes.core.window_spec as window_specs import bigframes.dataframe as df @@ -87,6 +89,8 @@ def __getitem__( typing.Sequence[blocks.Label], ], ): + import bigframes._tools.strings + if utils.is_list_like(key): keys = list(key) else: @@ -101,7 +105,7 @@ def __getitem__( possible_key.append( min( self._block.column_labels, - key=lambda item: jellyfish.damerau_levenshtein_distance( + key=lambda item: bigframes._tools.strings.levenshtein_distance( bad_key, item ), ) @@ -308,20 +312,41 @@ def diff(self, periods=1) -> series.Series: @validations.requires_ordering() def rolling( self, - window: int, + window: int | pd.Timedelta | numpy.timedelta64 | datetime.timedelta | str, min_periods=None, + on: str | None = None, closed: Literal["right", "left", "both", "neither"] = "right", ) -> windows.Window: - window_spec = window_specs.WindowSpec( - bounds=window_specs.RowsWindowBounds.from_window_size(window, closed), - min_periods=min_periods if min_periods is not None else window, - grouping_keys=tuple(ex.deref(col) for col in self._by_col_ids), - ) - block = self._block.order_by( - [order.ascending_over(col) for col in self._by_col_ids], - ) - return windows.Window( - block, window_spec, self._selected_cols, drop_null_groups=self._dropna + if isinstance(window, int): + window_spec = window_specs.WindowSpec( + bounds=window_specs.RowsWindowBounds.from_window_size(window, closed), + min_periods=min_periods if min_periods is not None else window, + grouping_keys=tuple(ex.deref(col) for col in self._by_col_ids), + ) + block = self._block.order_by( + [order.ascending_over(col) for col in self._by_col_ids], + ) + skip_agg_col_id = ( + None if on is None else self._block.resolve_label_exact_or_error(on) + ) + return windows.Window( + block, + window_spec, + self._selected_cols, + drop_null_groups=self._dropna, + skip_agg_column_id=skip_agg_col_id, + ) + + return rolling.create_range_window( + self._block, + window, + min_periods=min_periods, + value_column_ids=self._selected_cols, + on=on, + closed=closed, + is_series=False, + grouping_keys=self._by_col_ids, + drop_null_groups=self._dropna, ) @validations.requires_ordering() @@ -511,7 +536,7 @@ def _aggregate_all( def _apply_window_op( self, - op: agg_ops.WindowOp, + op: agg_ops.UnaryWindowOp, window: typing.Optional[window_specs.WindowSpec] = None, numeric_only: bool = False, ): diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index 761a02bd34..a29bb45a32 100644 --- a/bigframes/core/groupby/series_group_by.py +++ b/bigframes/core/groupby/series_group_by.py @@ -14,11 +14,14 @@ from __future__ import annotations +import datetime import typing from typing import Literal, 
Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby +import numpy +import pandas from bigframes import session from bigframes.core import expression as ex @@ -29,6 +32,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.validations as validations +from bigframes.core.window import rolling import bigframes.core.window as windows import bigframes.core.window_spec as window_specs import bigframes.dataframe as df @@ -246,24 +250,36 @@ def diff(self, periods=1) -> series.Series: @validations.requires_ordering() def rolling( self, - window: int, + window: int | pandas.Timedelta | numpy.timedelta64 | datetime.timedelta | str, min_periods=None, closed: Literal["right", "left", "both", "neither"] = "right", ) -> windows.Window: - window_spec = window_specs.WindowSpec( - bounds=window_specs.RowsWindowBounds.from_window_size(window, closed), - min_periods=min_periods if min_periods is not None else window, - grouping_keys=tuple(ex.deref(col) for col in self._by_col_ids), - ) - block = self._block.order_by( - [order.ascending_over(col) for col in self._by_col_ids], - ) - return windows.Window( - block, - window_spec, - [self._value_column], - drop_null_groups=self._dropna, + if isinstance(window, int): + window_spec = window_specs.WindowSpec( + bounds=window_specs.RowsWindowBounds.from_window_size(window, closed), + min_periods=min_periods if min_periods is not None else window, + grouping_keys=tuple(ex.deref(col) for col in self._by_col_ids), + ) + block = self._block.order_by( + [order.ascending_over(col) for col in self._by_col_ids], + ) + return windows.Window( + block, + window_spec, + [self._value_column], + drop_null_groups=self._dropna, + is_series=True, + ) + + return rolling.create_range_window( + self._block, + window, + min_periods=min_periods, + value_column_ids=[self._value_column], + closed=closed, is_series=True, + grouping_keys=self._by_col_ids, + drop_null_groups=self._dropna, ) @validations.requires_ordering() @@ -294,7 +310,7 @@ def _aggregate(self, aggregate_op: agg_ops.UnaryAggregateOp) -> series.Series: def _apply_window_op( self, - op: agg_ops.WindowOp, + op: agg_ops.UnaryWindowOp, discard_name=False, window: typing.Optional[window_specs.WindowSpec] = None, never_skip_nulls: bool = False, diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index d891e385d5..70b1741af7 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -16,42 +16,281 @@ from __future__ import annotations +import dataclasses +import functools +import io +import itertools +import json +from typing import Any, Callable, cast, Generator, Iterable, Literal, Optional, Union +import uuid + +import geopandas # type: ignore +import numpy as np +import pandas import pyarrow as pa +import pyarrow.parquet # type: ignore import bigframes.core.schema as schemata import bigframes.dtypes -def arrow_schema_to_bigframes(arrow_schema: pa.Schema) -> schemata.ArraySchema: - """Infer the corresponding bigframes schema given a pyarrow schema.""" - schema_items = tuple( - schemata.SchemaItem( - field.name, - bigframes_type_for_arrow_type(field.type), +@dataclasses.dataclass(frozen=True) +class LocalTableMetadata: + total_bytes: int + row_count: int + + @classmethod + def from_arrow(cls, table: pa.Table) -> LocalTableMetadata: + return cls(total_bytes=table.nbytes, row_count=table.num_rows) + + +_MANAGED_STORAGE_TYPES_OVERRIDES: 
dict[bigframes.dtypes.Dtype, pa.DataType] = { + # wkt to be precise + bigframes.dtypes.GEO_DTYPE: pa.string(), + # Just json as string + bigframes.dtypes.JSON_DTYPE: pa.string(), +} + + +@dataclasses.dataclass(frozen=True) +class ManagedArrowTable: + data: pa.Table = dataclasses.field(hash=False) + schema: schemata.ArraySchema = dataclasses.field(hash=False) + id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4) + + def __post_init__(self): + self.validate() + + @functools.cached_property + def metadata(self) -> LocalTableMetadata: + return LocalTableMetadata.from_arrow(self.data) + + @classmethod + def from_pandas(cls, dataframe: pandas.DataFrame) -> ManagedArrowTable: + """Creates managed table from pandas. Ignores index, col names must be unique strings""" + columns: list[pa.ChunkedArray] = [] + fields: list[schemata.SchemaItem] = [] + column_names = list(dataframe.columns) + assert len(column_names) == len(set(column_names)) + + for name, col in dataframe.items(): + new_arr, bf_type = _adapt_pandas_series(col) + columns.append(new_arr) + fields.append(schemata.SchemaItem(str(name), bf_type)) + + return ManagedArrowTable( + pa.table(columns, names=column_names), schemata.ArraySchema(tuple(fields)) ) - for field in arrow_schema - ) - return schemata.ArraySchema(schema_items) + @classmethod + def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable: + columns: list[pa.ChunkedArray] = [] + fields: list[schemata.SchemaItem] = [] + for name, arr in zip(table.column_names, table.columns): + new_arr, bf_type = _adapt_arrow_array(arr) + columns.append(new_arr) + fields.append(schemata.SchemaItem(name, bf_type)) -def adapt_pa_table(arrow_table: pa.Table) -> pa.Table: - """Adapt a pyarrow table to one that can be handled by bigframes. Converts tz to UTC and unit to us for temporal types.""" - new_schema = pa.schema( - [ - pa.field(field.name, arrow_type_replacements(field.type)) - for field in arrow_table.schema - ] - ) - return arrow_table.cast(new_schema) + return ManagedArrowTable( + pa.table(columns, names=table.column_names), + schemata.ArraySchema(tuple(fields)), + ) + + def to_parquet( + self, + dst: Union[str, io.IOBase], + *, + offsets_col: Optional[str] = None, + geo_format: Literal["wkb", "wkt"] = "wkt", + duration_type: Literal["int", "duration"] = "duration", + json_type: Literal["string"] = "string", + ): + pa_table = self.data + if offsets_col is not None: + pa_table = pa_table.append_column( + offsets_col, pa.array(range(pa_table.num_rows), type=pa.int64()) + ) + if geo_format != "wkt": + raise NotImplementedError(f"geo format {geo_format} not yet implemented") + if duration_type != "duration": + raise NotImplementedError( + f"duration as {duration_type} not yet implemented" + ) + assert json_type == "string" + pyarrow.parquet.write_table(pa_table, where=dst) + + def itertuples( + self, + *, + geo_format: Literal["wkb", "wkt"] = "wkt", + duration_type: Literal["int", "timedelta"] = "timedelta", + json_type: Literal["string", "object"] = "string", + ) -> Iterable[tuple]: + """ + Yield each row as an unlabeled tuple. + + Row-wise iteration of columnar data is slow, avoid if possible. 
+ """ + for row_dict in _iter_table( + self.data, + self.schema, + geo_format=geo_format, + duration_type=duration_type, + json_type=json_type, + ): + yield tuple(row_dict.values()) + + def validate(self): + # TODO: Content-based validation for some datatypes (eg json, wkt, list) where logical domain is smaller than pyarrow type + for bf_field, arrow_field in zip(self.schema.items, self.data.schema): + expected_arrow_type = _get_managed_storage_type(bf_field.dtype) + arrow_type = arrow_field.type + if expected_arrow_type != arrow_type: + raise TypeError( + f"Field {bf_field} has arrow array type: {arrow_type}, expected type: {expected_arrow_type}" + ) + + +# Sequential iterator, but could split into batches and leverage parallelism for speed +def _iter_table( + table: pa.Table, + schema: schemata.ArraySchema, + *, + geo_format: Literal["wkb", "wkt"] = "wkt", + duration_type: Literal["int", "timedelta"] = "timedelta", + json_type: Literal["string", "object"] = "string", +) -> Generator[dict[str, Any], None, None]: + """For when you feel like iterating row-wise over a column store. Don't expect speed.""" + + if geo_format != "wkt": + raise NotImplementedError(f"geo format {geo_format} not yet implemented") + @functools.singledispatch + def iter_array( + array: pa.Array, dtype: bigframes.dtypes.Dtype + ) -> Generator[Any, None, None]: + values = array.to_pylist() + if dtype == bigframes.dtypes.JSON_DTYPE: + if json_type == "object": + yield from map(lambda x: json.loads(x) if x is not None else x, values) + else: + yield from values + elif dtype == bigframes.dtypes.TIMEDELTA_DTYPE: + if duration_type == "int": + yield from map( + lambda x: ((x.days * 3600 * 24) + x.seconds) * 1_000_000 + + x.microseconds + if x is not None + else x, + values, + ) + else: + yield from values + else: + yield from values -def bigframes_type_for_arrow_type(pa_type: pa.DataType) -> bigframes.dtypes.Dtype: - return bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - arrow_type_replacements(pa_type) + @iter_array.register + def _( + array: pa.ListArray, dtype: bigframes.dtypes.Dtype + ) -> Generator[Any, None, None]: + value_generator = iter_array( + array.flatten(), bigframes.dtypes.get_array_inner_type(dtype) + ) + for (start, end) in itertools.pairwise(array.offsets): + arr_size = end.as_py() - start.as_py() + yield list(itertools.islice(value_generator, arr_size)) + + @iter_array.register + def _( + array: pa.StructArray, dtype: bigframes.dtypes.Dtype + ) -> Generator[Any, None, None]: + # yield from each subarray + sub_generators: dict[str, Generator[Any, None, None]] = {} + for field_name, dtype in bigframes.dtypes.get_struct_fields(dtype).items(): + sub_generators[field_name] = iter_array(array.field(field_name), dtype) + + keys = list(sub_generators.keys()) + for row_values in zip(*sub_generators.values()): + yield {key: value for key, value in zip(keys, row_values)} + + for batch in table.to_batches(): + sub_generators: dict[str, Generator[Any, None, None]] = {} + for field in schema.items: + sub_generators[field.column] = iter_array( + batch.column(field.column), field.dtype + ) + + keys = list(sub_generators.keys()) + for row_values in zip(*sub_generators.values()): + yield {key: value for key, value in zip(keys, row_values)} + + +def _adapt_pandas_series( + series: pandas.Series, +) -> tuple[Union[pa.ChunkedArray, pa.Array], bigframes.dtypes.Dtype]: + # Mostly rely on pyarrow conversions, but have to convert geo without its help. 
+ if series.dtype == bigframes.dtypes.GEO_DTYPE: + series = geopandas.GeoSeries(series).to_wkt(rounding_precision=-1) + return pa.array(series, type=pa.string()), bigframes.dtypes.GEO_DTYPE + try: + return _adapt_arrow_array(pa.array(series)) + except pa.ArrowInvalid as e: + if series.dtype == np.dtype("O"): + try: + return _adapt_pandas_series(series.astype(bigframes.dtypes.GEO_DTYPE)) + except TypeError: + # Prefer original error + pass + raise e + + +def _adapt_arrow_array( + array: Union[pa.ChunkedArray, pa.Array] +) -> tuple[Union[pa.ChunkedArray, pa.Array], bigframes.dtypes.Dtype]: + target_type = _logical_type_replacements(array.type) + if target_type != array.type: + # TODO: Maybe warn if lossy conversion? + array = array.cast(target_type) + bf_type = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(target_type) + + storage_type = _get_managed_storage_type(bf_type) + if storage_type != array.type: + array = array.cast(storage_type) + return array, bf_type + + +def _get_managed_storage_type(dtype: bigframes.dtypes.Dtype) -> pa.DataType: + if dtype in _MANAGED_STORAGE_TYPES_OVERRIDES.keys(): + return _MANAGED_STORAGE_TYPES_OVERRIDES[dtype] + return _physical_type_replacements( + bigframes.dtypes.bigframes_dtype_to_arrow_dtype(dtype) ) -def arrow_type_replacements(type: pa.DataType) -> pa.DataType: +def _recursive_map_types( + f: Callable[[pa.DataType], pa.DataType] +) -> Callable[[pa.DataType], pa.DataType]: + @functools.wraps(f) + def recursive_f(type: pa.DataType) -> pa.DataType: + if pa.types.is_list(type): + new_field_t = recursive_f(type.value_type) + if new_field_t != type.value_type: + return pa.list_(new_field_t) + return type + if pa.types.is_struct(type): + struct_type = cast(pa.StructType, type) + new_fields: list[pa.Field] = [] + for i in range(struct_type.num_fields): + field = struct_type.field(i) + new_fields.append(field.with_type(recursive_f(field.type))) + return pa.struct(new_fields) + return f(type) + + return recursive_f + + +@_recursive_map_types +def _logical_type_replacements(type: pa.DataType) -> pa.DataType: if pa.types.is_timestamp(type): # This is potentially lossy, but BigFrames doesn't support ns new_tz = "UTC" if (type.tz is not None) else None @@ -66,10 +305,27 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: return pa.decimal128(38, 9) if pa.types.is_decimal256(type): return pa.decimal256(76, 38) - if pa.types.is_dictionary(type): - return arrow_type_replacements(type.value_type) if pa.types.is_large_string(type): # simple string type can handle the largest strings needed return pa.string() + if pa.types.is_dictionary(type): + return _logical_type_replacements(type.value_type) + if pa.types.is_null(type): + # null as a type not allowed, default type is float64 for bigframes + return pa.float64() else: return type + + +_ARROW_MANAGED_STORAGE_OVERRIDES = { + bigframes.dtypes._BIGFRAMES_TO_ARROW[bf_dtype]: arrow_type + for bf_dtype, arrow_type in _MANAGED_STORAGE_TYPES_OVERRIDES.items() + if bf_dtype in bigframes.dtypes._BIGFRAMES_TO_ARROW +} + + +@_recursive_map_types +def _physical_type_replacements(dtype: pa.DataType) -> pa.DataType: + if dtype in _ARROW_MANAGED_STORAGE_OVERRIDES: + return _ARROW_MANAGED_STORAGE_OVERRIDES[dtype] + return dtype diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 714a522183..8be46f531c 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -110,25 +110,42 @@ def submit_pandas_labels( bq_client.query(query, job_config=job_config) -def 
class_logger(decorated_cls): +def class_logger(decorated_cls=None, /, *, include_internal_calls=False): """Decorator that adds logging functionality to each method of the class.""" - for attr_name, attr_value in decorated_cls.__dict__.items(): - if callable(attr_value) and (attr_name not in _excluded_methods): - if isinstance(attr_value, staticmethod): - # TODO(b/390244171) support for staticmethod - pass - else: + + def wrap(cls): + for attr_name, attr_value in cls.__dict__.items(): + if callable(attr_value) and (attr_name not in _excluded_methods): + if isinstance(attr_value, staticmethod): + # TODO(b/390244171) support for staticmethod + pass + else: + setattr( + cls, + attr_name, + method_logger( + attr_value, + cls, + include_internal_calls, + ), + ) + elif isinstance(attr_value, property): setattr( - decorated_cls, attr_name, method_logger(attr_value, decorated_cls) + cls, + attr_name, + property_logger(attr_value, cls, include_internal_calls), ) - elif isinstance(attr_value, property): - setattr( - decorated_cls, attr_name, property_logger(attr_value, decorated_cls) - ) - return decorated_cls + return cls + + if decorated_cls is None: + # The logger is used with parentheses + return wrap + + # The logger is used without parentheses + return wrap(decorated_cls) -def method_logger(method, decorated_cls): +def method_logger(method, decorated_cls, include_internal_calls: bool): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) @@ -138,7 +155,7 @@ def wrapper(self, *args, **kwargs): full_method_name = f"{class_name.lower()}-{api_method_name}" # Track directly called methods - if len(_call_stack) == 0: + if len(_call_stack) == 0 or include_internal_calls: add_api_method(full_method_name) _call_stack.append(full_method_name) @@ -167,7 +184,7 @@ def wrapper(self, *args, **kwargs): return wrapper -def property_logger(prop, decorated_cls): +def property_logger(prop, decorated_cls, include_internal_calls: bool): """Decorator that adds logging functionality to a property.""" def shared_wrapper(f): @@ -177,7 +194,7 @@ def wrapped(*args, **kwargs): property_name = f.__name__ full_property_name = f"{class_name.lower()}-{property_name.lower()}" - if len(_call_stack) == 0: + if len(_call_stack) == 0 or include_internal_calls: add_api_method(full_property_name) _call_stack.append(full_property_name) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index fbc43e033a..99c8f09bc0 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,16 +20,23 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple +from typing import ( + AbstractSet, + Callable, + cast, + Iterable, + Mapping, + Optional, + Sequence, + Tuple, +) import google.cloud.bigquery as bq -from bigframes.core import identifiers +from bigframes.core import identifiers, local_data from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field import bigframes.core.expression as ex -import bigframes.core.guid from bigframes.core.ordering import OrderingExpression -import bigframes.core.schema as schemata import bigframes.core.slices as slices import bigframes.core.window_spec as window import bigframes.dtypes @@ -574,16 +581,44 @@ def with_id(self, id: identifiers.ColumnId) -> ScanItem: @dataclasses.dataclass(frozen=True) class ScanList: + """ + Defines the set of columns to scan from a source, along with the variable to bind the columns to. + """ + items: typing.Tuple[ScanItem, ...] 
+ def filter_cols( + self, + ids: AbstractSet[identifiers.ColumnId], + ) -> ScanList: + """Drop columns from the scan except those in the 'ids' arg.""" + result = ScanList(tuple(item for item in self.items if item.id in ids)) + if len(result.items) == 0: + # We need to select something, or sql syntax breaks + result = ScanList(self.items[:1]) + return result + + def project( + self, + selections: Mapping[identifiers.ColumnId, identifiers.ColumnId], + ) -> ScanList: + """Project given ids from the scanlist, dropping previous bindings.""" + by_id = {item.id: item for item in self.items} + result = ScanList( + tuple( + by_id[old_id].with_id(new_id) for old_id, new_id in selections.items() + ) + ) + if len(result.items) == 0: + # We need to select something, or sql syntax breaks + result = ScanList((self.items[:1])) + return result + @dataclasses.dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): - # TODO: Combine feather_bytes, data_schema, n_rows into a LocalDataDef struct # TODO: Track nullability for local data - feather_bytes: bytes - data_schema: schemata.ArraySchema - n_rows: int + local_data_source: local_data.ManagedArrowTable # Mapping of local ids to bfet id. scan_list: ScanList # Offsets are generated only if this is non-null @@ -623,7 +658,7 @@ def explicitly_ordered(self) -> bool: @property def row_count(self) -> typing.Optional[int]: - return self.n_rows + return self.local_data_source.metadata.row_count @property def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: @@ -659,7 +694,6 @@ class GbqTable: dataset_id: str = dataclasses.field() table_id: str = dataclasses.field() physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field() - n_rows: int = dataclasses.field() is_physically_stored: bool = dataclasses.field() cluster_cols: typing.Optional[Tuple[str, ...]] @@ -675,13 +709,17 @@ def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable: dataset_id=table.dataset_id, table_id=table.table_id, physical_schema=schema, - n_rows=table.num_rows, is_physically_stored=(table.table_type in ["TABLE", "MATERIALIZED_VIEW"]), cluster_cols=None if table.clustering_fields is None else tuple(table.clustering_fields), ) + def get_table_ref(self) -> bq.TableReference: + return bq.TableReference( + bq.DatasetReference(self.project_id, self.dataset_id), self.table_id + ) + @property @functools.cache def schema_by_id(self): @@ -701,6 +739,7 @@ class BigqueryDataSource: # Added for backwards compatibility, not validated sql_predicate: typing.Optional[str] = None ordering: typing.Optional[orderings.RowOrdering] = None + n_rows: Optional[int] = None ## Put ordering in here or just add order_by node above? @@ -778,7 +817,7 @@ def variables_introduced(self) -> int: @property def row_count(self) -> typing.Optional[int]: if self.source.sql_predicate is None and self.source.table.is_physically_stored: - return self.source.table.n_rows + return self.source.n_rows return None @property @@ -1074,6 +1113,11 @@ def variables_introduced(self) -> int: # This operation only renames variables, doesn't actually create new ones return 0 + @property + def has_multi_referenced_ids(self) -> bool: + referenced = tuple(ref.ref.id for ref in self.input_output_pairs) + return len(referenced) != len(set(referenced)) + # TODO: Reuse parent namespace # Currently, Selection node allows renaming and reusing existing names, so it must establish a # new namespace.
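
The two ScanList helpers above share one invariant that is easy to miss: a pruned scan must never end up empty, because the generated SQL needs at least one column to select. A toy sketch of that contract, using simplified stand-in classes rather than the real bigframes node types:

from dataclasses import dataclass

@dataclass(frozen=True)
class Item:
    id: str

@dataclass(frozen=True)
class Scan:
    items: tuple

    def filter_cols(self, ids):
        kept = tuple(item for item in self.items if item.id in ids)
        # An empty projection would break SQL generation, so keep one column.
        return Scan(kept if kept else self.items[:1])

scan = Scan((Item("a"), Item("b"), Item("c")))
assert scan.filter_cols({"b"}) == Scan((Item("b"),))
assert scan.filter_cols(set()) == Scan((Item("a"),))
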
@@ -1358,7 +1402,7 @@ def _validate(self): """Validate the local data in the node.""" # Since inner order and row bounds are coupled, rank ops can't be row bounded assert ( - not self.window_spec.row_bounded + not self.window_spec.is_row_bounded ) or self.expression.op.implicitly_inherits_order assert all(ref in self.child.ids for ref in self.expression.column_references) @@ -1420,7 +1464,9 @@ def inherits_order(self) -> bool: op_inherits_order = ( not self.expression.op.order_independent ) and self.expression.op.implicitly_inherits_order - return op_inherits_order or self.window_spec.row_bounded + # range-bounded windows do not inherit orders because their ordering are + # already defined before rewrite time. + return op_inherits_order or self.window_spec.is_row_bounded @property def additive_base(self) -> BigFrameNode: diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index d9a5a87145..86ccf52408 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -20,6 +20,7 @@ import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile import pandas as pd +import bigframes.constants import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils @@ -41,15 +42,37 @@ def cut( right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, ) -> bigframes.series.Series: - if labels is not None and labels is not False: + if ( + labels is not None + and labels is not False + and not isinstance(labels, typing.Iterable) + ): + raise ValueError( + "Bin labels must either be False, None or passed in as a list-like argument" + ) + if ( + isinstance(labels, typing.Iterable) + and len(list(labels)) > 0 + and not isinstance(list(labels)[0], str) + ): raise NotImplementedError( - "The 'labels' parameter must be either False or None. " - "Please provide a valid value for 'labels'." + "When using an iterable for labels, only iterables of strings are supported " + f"but found {type(list(labels)[0])}. 
{constants.FEEDBACK_LINK}" ) + if x.size == 0: + raise ValueError("Cannot cut empty array.") + if isinstance(bins, int): if bins <= 0: raise ValueError("`bins` should be a positive integer.") + if isinstance(labels, typing.Iterable): + labels = tuple(labels) + if len(labels) != bins: + raise ValueError( + f"Bin labels({len(labels)}) must be the same as the value of bins({bins})" + ) + op = agg_ops.CutOp(bins, right=right, labels=labels) return x._apply_window_op(op, window_spec=window_specs.unbound()) elif isinstance(bins, typing.Iterable): @@ -58,6 +81,7 @@ def cut( bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) # To maintain consistency with pandas' behavior right = True + labels = None elif len(list(bins)) == 0: as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple() @@ -66,6 +90,7 @@ def cut( bins = tuple(bins) # To maintain consistency with pandas' behavior right = True + labels = None elif pd.api.types.is_number(list(bins)[0]): bins_list = list(bins) as_index = pd.IntervalIndex.from_breaks(bins_list) @@ -81,11 +106,24 @@ def cut( raise ValueError("`bins` iterable should contain tuples or numerics.") if as_index.is_overlapping: - raise ValueError("Overlapping IntervalIndex is not accepted.") - elif len(as_index) == 0: - op = agg_ops.CutOp(bins, right=right, labels=labels) + raise ValueError("Overlapping IntervalIndex is not accepted.") # TODO: test + + if isinstance(labels, typing.Iterable): + labels = tuple(labels) + if len(labels) != len(as_index): + raise ValueError( + f"Bin labels({len(labels)}) must be the same as the number of bin edges" + f"({len(as_index)})" + ) + + if len(as_index) == 0: + dtype = agg_ops.CutOp(bins, right=right, labels=labels).output_type() return bigframes.series.Series( - [pd.NA] * len(x), dtype=op.output_type(), name=x.name + [pd.NA] * len(x), + dtype=dtype, + name=x.name, + index=x.index, + session=x._session, ) else: op = agg_ops.CutOp(bins, right=right, labels=labels) diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index e5f7578911..128cefe94c 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -17,8 +17,10 @@ from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.order import pull_up_order from bigframes.core.rewrite.pruning import column_pruning +from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions +from bigframes.core.rewrite.windows import rewrite_range_rolling __all__ = [ "legacy_join_as_projection", @@ -29,4 +31,6 @@ "remap_variables", "pull_up_order", "column_pruning", + "rewrite_range_rolling", + "try_reduce_to_table_scan", ] diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 5a94f2aa40..5f4990094c 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -170,7 +170,7 @@ def prune_readlocal( node: bigframes.core.nodes.ReadLocalNode, selection: AbstractSet[identifiers.ColumnId], ) -> bigframes.core.nodes.ReadLocalNode: - new_scan_list = filter_scanlist(node.scan_list, selection) + new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace( node, scan_list=new_scan_list, @@ -183,18 +183,5 @@ def prune_readtable( node: bigframes.core.nodes.ReadTableNode, selection: AbstractSet[identifiers.ColumnId], ) -> 
bigframes.core.nodes.ReadTableNode: - new_scan_list = filter_scanlist(node.scan_list, selection) + new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace(node, scan_list=new_scan_list) - - -def filter_scanlist( - scanlist: bigframes.core.nodes.ScanList, - ids: AbstractSet[identifiers.ColumnId], -): - result = bigframes.core.nodes.ScanList( - tuple(item for item in scanlist.items if item.id in ids) - ) - if len(result.items) == 0: - # We need to select something, or stuff breaks - result = bigframes.core.nodes.ScanList(scanlist.items[:1]) - return result diff --git a/bigframes/core/rewrite/scan_reduction.py b/bigframes/core/rewrite/scan_reduction.py new file mode 100644 index 0000000000..be8db4827c --- /dev/null +++ b/bigframes/core/rewrite/scan_reduction.py @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import dataclasses +import functools +from typing import Optional + +from bigframes.core import nodes + + +def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]: + for node in root.unique_nodes(): + if not isinstance(node, (nodes.ReadTableNode, nodes.SelectionNode)): + return None + result = root.bottom_up(merge_scan) + if isinstance(result, nodes.ReadTableNode): + return result + return None + + +@functools.singledispatch +def merge_scan(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + return node + + +@merge_scan.register +def _(node: nodes.SelectionNode) -> nodes.BigFrameNode: + if not isinstance(node.child, nodes.ReadTableNode): + return node + if node.has_multi_referenced_ids: + return node + + selection = { + aliased_ref.ref.id: aliased_ref.id for aliased_ref in node.input_output_pairs + } + new_scan_list = node.child.scan_list.project(selection) + return dataclasses.replace(node.child, scan_list=new_scan_list) diff --git a/bigframes/core/rewrite/windows.py b/bigframes/core/rewrite/windows.py new file mode 100644 index 0000000000..9f55db23af --- /dev/null +++ b/bigframes/core/rewrite/windows.py @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
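
scan_reduction.py above leans on functools.singledispatch to express per-node rewrites: the default handler is the identity, and a registered overload folds a Selection directly into the table scan beneath it. A runnable toy version of the same shape, where ReadTable and Selection are illustrative stand-ins rather than the bigframes node classes:

import functools
from dataclasses import dataclass

@dataclass(frozen=True)
class ReadTable:
    columns: tuple

@dataclass(frozen=True)
class Selection:
    child: object
    keep: tuple

@functools.singledispatch
def merge_scan(node):
    # Default case: leave unknown node types untouched.
    return node

@merge_scan.register
def _(node: Selection):
    if not isinstance(node.child, ReadTable):
        return node
    # Fold the projection into the scan itself.
    return ReadTable(node.keep)

assert merge_scan(Selection(ReadTable(("a", "b", "c")), ("a",))) == ReadTable(("a",))
assert merge_scan(ReadTable(("x",))) == ReadTable(("x",))
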
+ +from __future__ import annotations + +import dataclasses + +from bigframes import operations as ops +from bigframes.core import nodes + + +def rewrite_range_rolling(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if not isinstance(node, nodes.WindowOpNode): + return node + + if not node.window_spec.is_range_bounded: + return node + + if len(node.window_spec.ordering) != 1: + raise ValueError( + "Range rolling should only be performed on exactly one column." + ) + + ordering_expr = node.window_spec.ordering[0] + + new_ordering = dataclasses.replace( + ordering_expr, + scalar_expression=ops.UnixMicros().as_expr(ordering_expr.scalar_expression), + ) + + return dataclasses.replace( + node, + window_spec=dataclasses.replace(node.window_spec, ordering=(new_ordering,)), + ) diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index c379db72be..c4cbb51ef9 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -67,9 +67,13 @@ def dtypes(self) -> typing.Tuple[bigframes.dtypes.Dtype, ...]: def _mapping(self) -> typing.Dict[ColumnIdentifierType, bigframes.dtypes.Dtype]: return {item.column: item.dtype for item in self.items} - def to_bigquery(self) -> typing.Tuple[google.cloud.bigquery.SchemaField, ...]: + def to_bigquery( + self, overrides: dict[bigframes.dtypes.Dtype, str] = {} + ) -> typing.Tuple[google.cloud.bigquery.SchemaField, ...]: return tuple( - bigframes.dtypes.convert_to_schema_field(item.column, item.dtype) + bigframes.dtypes.convert_to_schema_field( + item.column, item.dtype, overrides=overrides + ) for item in self.items ) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index f4de177f37..d197993305 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -23,7 +23,7 @@ import math from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union -import shapely # type: ignore +import shapely.geometry.base # type: ignore import bigframes.core.compile.googlesql as googlesql @@ -33,9 +33,19 @@ import bigframes.core.ordering +# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. +try: + from shapely.io import to_wkt # type: ignore +except ImportError: + from shapely.wkt import dumps # type: ignore + + to_wkt = dumps + + ### Writing SQL Values (literals, column references, table references, etc.) 
def simple_literal(value: bytes | str | int | bool | float | datetime.datetime | None): """Return quoted input string.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals if value is None: return "NULL" @@ -65,8 +75,8 @@ def simple_literal(value: bytes | str | int | bool | float | datetime.datetime | return f"DATE('{value.isoformat()}')" elif isinstance(value, datetime.time): return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))" - elif isinstance(value, shapely.Geometry): - return f"ST_GEOGFROMTEXT({simple_literal(shapely.to_wkt(value))})" + elif isinstance(value, shapely.geometry.base.BaseGeometry): + return f"ST_GEOGFROMTEXT({simple_literal(to_wkt(value))})" elif isinstance(value, decimal.Decimal): # TODO: disambiguate BIGNUMERIC based on scale and/or precision return f"CAST('{str(value)}' AS NUMERIC)" diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 684290bf81..ee09fc69cb 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -18,15 +18,11 @@ from typing import Hashable, Iterable, List import warnings -import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.common as vendored_pandas_io_common import numpy as np import pandas as pd -import pandas.api.types as pdtypes -import pyarrow as pa import typing_extensions -import bigframes.dtypes as dtypes import bigframes.exceptions as bfe UNNAMED_COLUMN_ID = "bigframes_unnamed_column" @@ -222,76 +218,3 @@ def timedelta_to_micros( ) * 1_000_000 + timedelta.microseconds raise TypeError(f"Unrecognized input type: {type(timedelta)}") - - -def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: - """ - Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored. - - Returns: - The names of updated columns - """ - updated_columns = [] - - for col in dataframe.columns: - if pdtypes.is_timedelta64_dtype(dataframe[col].dtype): - dataframe[col] = dataframe[col].apply(timedelta_to_micros) - updated_columns.append(col) - - if pdtypes.is_timedelta64_dtype(dataframe.index.dtype): - dataframe.index = dataframe.index.map(timedelta_to_micros) - updated_columns.append(dataframe.index.name) - - return updated_columns - - -def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool: - """ - Searches recursively for JSON array type within a PyArrow DataType. - """ - if arrow_type == dtypes.JSON_ARROW_TYPE: - return True - if pa.types.is_list(arrow_type): - return _search_for_nested_json_type(arrow_type.value_type) - if pa.types.is_struct(arrow_type): - for i in range(arrow_type.num_fields): - if _search_for_nested_json_type(arrow_type.field(i).type): - return True - return False - return False - - -def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]: - """ - Due to a BigQuery IO limitation with loading JSON from Parquet files (b/374784249), - we're using a workaround: storing JSON as strings and then parsing them into JSON - objects. - TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. - """ - updated_columns = [] - - for col in dataframe.columns: - column_type = dataframe[col].dtype - if column_type == dtypes.JSON_DTYPE: - dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE) - updated_columns.append(col) - elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type( - column_type.pyarrow_dtype - ): - raise NotImplementedError( - f"Nested JSON types, found in column `{col}`: `{column_type}`', " - f"are currently unsupported for upload. 
{constants.FEEDBACK_LINK}" - ) - - if dataframe.index.dtype == dtypes.JSON_DTYPE: - dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE) - updated_columns.append(dataframe.index.name) - elif isinstance( - dataframe.index.dtype, pd.ArrowDtype - ) and _search_for_nested_json_type(dataframe.index.dtype.pyarrow_dtype): - raise NotImplementedError( - f"Nested JSON types, found in the index: `{dataframe.index.dtype}`', " - f"are currently unsupported for upload. {constants.FEEDBACK_LINK}" - ) - - return updated_columns diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index 7758145fd4..1d888ca7e6 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,86 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations +from bigframes.core.window.rolling import Window -import typing - -import bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling - -from bigframes.core import log_adapter, window_spec -import bigframes.core.blocks as blocks -import bigframes.operations.aggregations as agg_ops - - -@log_adapter.class_logger -class Window(vendored_pandas_rolling.Window): - __doc__ = vendored_pandas_rolling.Window.__doc__ - - def __init__( - self, - block: blocks.Block, - window_spec: window_spec.WindowSpec, - value_column_ids: typing.Sequence[str], - drop_null_groups: bool = True, - is_series: bool = False, - ): - self._block = block - self._window_spec = window_spec - self._value_column_ids = value_column_ids - self._drop_null_groups = drop_null_groups - self._is_series = is_series - - def count(self): - return self._apply_aggregate(agg_ops.count_op) - - def sum(self): - return self._apply_aggregate(agg_ops.sum_op) - - def mean(self): - return self._apply_aggregate(agg_ops.mean_op) - - def var(self): - return self._apply_aggregate(agg_ops.var_op) - - def std(self): - return self._apply_aggregate(agg_ops.std_op) - - def max(self): - return self._apply_aggregate(agg_ops.max_op) - - def min(self): - return self._apply_aggregate(agg_ops.min_op) - - def _apply_aggregate( - self, - op: agg_ops.UnaryAggregateOp, - ): - block = self._block - labels = [block.col_id_to_label[col] for col in self._value_column_ids] - block, result_ids = block.multi_apply_window_op( - self._value_column_ids, - op, - self._window_spec, - skip_null_groups=self._drop_null_groups, - never_skip_nulls=True, - ) - - if self._window_spec.grouping_keys: - original_index_ids = block.index_columns - block = block.reset_index(drop=False) - index_ids = ( - *[col.id.name for col in self._window_spec.grouping_keys], - *original_index_ids, - ) - block = block.set_index(col_ids=index_ids) - - if self._is_series: - from bigframes.series import Series - - return Series(block.select_columns(result_ids).with_column_labels(labels)) - else: - from bigframes.dataframe import DataFrame - - return DataFrame( - block.select_columns(result_ids).with_column_labels(labels) - ) +__all__ = ["Window"] diff --git a/bigframes/core/window/ordering.py b/bigframes/core/window/ordering.py new file mode 100644 index 0000000000..0bea585bb0 --- /dev/null +++ b/bigframes/core/window/ordering.py @@ -0,0 +1,86 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from functools import singledispatch + +from bigframes.core import expression as ex +from bigframes.core import nodes, ordering + + +@singledispatch +def find_order_direction( + root: nodes.BigFrameNode, column_id: str +) -> ordering.OrderingDirection | None: + """Returns the order of the given column with tree traversal. If the column cannot be found, + or the ordering information is not available, return None. + """ + return None + + +@find_order_direction.register +def _(root: nodes.OrderByNode, column_id: str): + if len(root.by) == 0: + # This is a no-op + return find_order_direction(root.child, column_id) + + # Make sure the window key is the prefix of sorting keys. + order_expr = root.by[0] + scalar_expr = order_expr.scalar_expression + if isinstance(scalar_expr, ex.DerefOp) and scalar_expr.id.name == column_id: + return order_expr.direction + + return None + + +@find_order_direction.register +def _(root: nodes.ReversedNode, column_id: str): + direction = find_order_direction(root.child, column_id) + + if direction is None: + return None + return direction.reverse() + + +@find_order_direction.register +def _(root: nodes.SelectionNode, column_id: str): + for alias_ref in root.input_output_pairs: + if alias_ref.id.name == column_id: + return find_order_direction(root.child, alias_ref.ref.id.name) + + +@find_order_direction.register +def _(root: nodes.FilterNode, column_id: str): + return find_order_direction(root.child, column_id) + + +@find_order_direction.register +def _(root: nodes.InNode, column_id: str): + return find_order_direction(root.left_child, column_id) + + +@find_order_direction.register +def _(root: nodes.WindowOpNode, column_id: str): + return find_order_direction(root.child, column_id) + + +@find_order_direction.register +def _(root: nodes.ProjectionNode, column_id: str): + for expr, ref in root.assignments: + if ref.name == column_id and isinstance(expr, ex.DerefOp): + # This source column is renamed. + return find_order_direction(root.child, expr.id.name) + + return find_order_direction(root.child, column_id) diff --git a/bigframes/core/window/rolling.py b/bigframes/core/window/rolling.py new file mode 100644 index 0000000000..a9c6dfdfa7 --- /dev/null +++ b/bigframes/core/window/rolling.py @@ -0,0 +1,185 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
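
The point of find_order_direction above is to let range rolling verify, from the expression tree alone, that the rolling key is already in a monotonic order, and in which direction. Conceptually it answers the same question as this pandas-level check; this is illustrative only, since the real implementation walks BigFrames nodes instead of touching data:

import pandas as pd

def order_direction(series: pd.Series, label: str) -> str:
    if series.is_monotonic_increasing:
        return "ascending"
    if series.is_monotonic_decreasing:
        return "descending"
    raise ValueError(
        f"The {label} might not be in a monotonic order. "
        f"Please sort by {label} before rolling."
    )

ts = pd.Series(pd.to_datetime(["2025-01-01", "2025-01-02", "2025-01-03"], utc=True))
assert order_direction(ts, "index") == "ascending"
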
+ +from __future__ import annotations + +import datetime +import typing + +import bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling +import numpy +import pandas + +from bigframes import dtypes +from bigframes.core import expression as ex +from bigframes.core import log_adapter, ordering, window_spec +import bigframes.core.blocks as blocks +from bigframes.core.window import ordering as window_ordering +import bigframes.operations.aggregations as agg_ops + + +@log_adapter.class_logger +class Window(vendored_pandas_rolling.Window): + __doc__ = vendored_pandas_rolling.Window.__doc__ + + def __init__( + self, + block: blocks.Block, + window_spec: window_spec.WindowSpec, + value_column_ids: typing.Sequence[str], + drop_null_groups: bool = True, + is_series: bool = False, + skip_agg_column_id: str | None = None, + ): + self._block = block + self._window_spec = window_spec + self._value_column_ids = value_column_ids + self._drop_null_groups = drop_null_groups + self._is_series = is_series + # The column ID that won't be aggregated on. + # This is equivalent to pandas `on` parameter in rolling() + self._skip_agg_column_id = skip_agg_column_id + + def count(self): + return self._apply_aggregate(agg_ops.count_op) + + def sum(self): + return self._apply_aggregate(agg_ops.sum_op) + + def mean(self): + return self._apply_aggregate(agg_ops.mean_op) + + def var(self): + return self._apply_aggregate(agg_ops.var_op) + + def std(self): + return self._apply_aggregate(agg_ops.std_op) + + def max(self): + return self._apply_aggregate(agg_ops.max_op) + + def min(self): + return self._apply_aggregate(agg_ops.min_op) + + def _apply_aggregate( + self, + op: agg_ops.UnaryAggregateOp, + ): + agg_block = self._aggregate_block(op) + + if self._is_series: + from bigframes.series import Series + + return Series(agg_block) + else: + from bigframes.dataframe import DataFrame + + # Preserve column order. 
+ column_labels = [ + self._block.col_id_to_label[col_id] for col_id in self._value_column_ids + ] + return DataFrame(agg_block)._reindex_columns(column_labels) + + def _aggregate_block(self, op: agg_ops.UnaryAggregateOp) -> blocks.Block: + agg_col_ids = [ + col_id + for col_id in self._value_column_ids + if col_id != self._skip_agg_column_id + ] + block, result_ids = self._block.multi_apply_window_op( + agg_col_ids, + op, + self._window_spec, + skip_null_groups=self._drop_null_groups, + never_skip_nulls=True, + ) + + if self._window_spec.grouping_keys: + original_index_ids = block.index_columns + block = block.reset_index(drop=False) + index_ids = ( + *[col.id.name for col in self._window_spec.grouping_keys], + *original_index_ids, + ) + block = block.set_index(col_ids=index_ids) + + labels = [self._block.col_id_to_label[col] for col in agg_col_ids] + if self._skip_agg_column_id is not None: + result_ids = [self._skip_agg_column_id, *result_ids] + labels.insert(0, self._block.col_id_to_label[self._skip_agg_column_id]) + + return block.select_columns(result_ids).with_column_labels(labels) + + +def create_range_window( + block: blocks.Block, + window: pandas.Timedelta | numpy.timedelta64 | datetime.timedelta | str, + *, + value_column_ids: typing.Sequence[str] = tuple(), + min_periods: int | None, + on: str | None = None, + closed: typing.Literal["right", "left", "both", "neither"], + is_series: bool, + grouping_keys: typing.Sequence[str] = tuple(), + drop_null_groups: bool = True, +) -> Window: + + if on is None: + # Rolling on index + index_dtypes = block.index.dtypes + if len(index_dtypes) > 1: + raise ValueError("Range rolling on MultiIndex is not supported") + if index_dtypes[0] != dtypes.TIMESTAMP_DTYPE: + raise ValueError("Index type should be timestamps with timezones") + rolling_key_col_id = block.index_columns[0] + else: + # Rolling on a specific column + rolling_key_col_id = block.resolve_label_exact_or_error(on) + if block.expr.get_column_type(rolling_key_col_id) != dtypes.TIMESTAMP_DTYPE: + raise ValueError(f"Column {on} type should be timestamps with timezones") + + order_direction = window_ordering.find_order_direction( + block.expr.node, rolling_key_col_id + ) + if order_direction is None: + target_str = "index" if on is None else f"column {on}" + raise ValueError( + f"The {target_str} might not be in a monotonic order. Please sort by {target_str} before rolling." + ) + if isinstance(window, str): + window = pandas.Timedelta(window) + spec = window_spec.WindowSpec( + bounds=window_spec.RangeWindowBounds.from_timedelta_window(window, closed), + min_periods=1 if min_periods is None else min_periods, + ordering=( + ordering.OrderingExpression(ex.deref(rolling_key_col_id), order_direction), + ), + grouping_keys=tuple(ex.deref(col) for col in grouping_keys), + ) + + selected_value_col_ids = ( + value_column_ids if value_column_ids else block.value_columns + ) + # This step must be done after finding the order direction of the window key. 
+    if grouping_keys:
+        block = block.order_by([ordering.ascending_over(col) for col in grouping_keys])
+
+    return Window(
+        block,
+        spec,
+        value_column_ids=selected_value_col_ids,
+        is_series=is_series,
+        skip_agg_column_id=None if on is None else rolling_key_col_id,
+        drop_null_groups=drop_null_groups,
+    )
diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py
index 142e3a7e00..d08ba3d12a 100644
--- a/bigframes/core/window_spec.py
+++ b/bigframes/core/window_spec.py
@@ -14,9 +14,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, replace
+import datetime
 import itertools
 from typing import Literal, Mapping, Optional, Set, Tuple, Union
 
+import numpy as np
+import pandas as pd
+
 import bigframes.core.expression as ex
 import bigframes.core.identifiers as ids
 import bigframes.core.ordering as orderings
@@ -168,9 +172,31 @@ def __post_init__(self):
 
 @dataclass(frozen=True)
 class RangeWindowBounds:
-    # TODO(b/388916840) Support range rolling on timeseries with timedeltas.
-    start: Optional[int] = None
-    end: Optional[int] = None
+    """Represents a time range window, inclusively bounded by start and end"""
+
+    start: pd.Timedelta | None = None
+    end: pd.Timedelta | None = None
+
+    @classmethod
+    def from_timedelta_window(
+        cls,
+        window: pd.Timedelta | np.timedelta64 | datetime.timedelta,
+        closed: Literal["right", "left", "both", "neither"],
+    ) -> RangeWindowBounds:
+        window = pd.Timedelta(window)
+        tick = pd.Timedelta("1us")
+        zero = pd.Timedelta(0)
+
+        if closed == "right":
+            return cls(-(window - tick), zero)
+        elif closed == "left":
+            return cls(-window, -tick)
+        elif closed == "both":
+            return cls(-window, zero)
+        elif closed == "neither":
+            return cls(-(window - tick), -tick)
+        else:
+            raise ValueError(f"Unsupported value for 'closed' parameter: {closed}")
 
     def __post_init__(self):
         if self.start is None:
@@ -187,10 +213,12 @@ class WindowSpec:
     """
     Specifies a window over which aggregate and analytic function may be applied.
-    grouping_keys: set of column ids to group on
-    preceding: Number of preceding rows in the window
-    following: Number of preceding rows in the window
-    ordering: List of columns ids and ordering direction to override base ordering
+
+    Attributes:
+        grouping_keys: A set of column ids to group on
+        bounds: The window boundaries
+        ordering: A list of column ids and ordering directions to override the base ordering
+        min_periods: The minimum number of observations in window required to have a value
     """
 
     grouping_keys: Tuple[ex.DerefOp, ...] = tuple()
@@ -199,7 +227,7 @@ class WindowSpec:
     min_periods: int = 0
 
     @property
-    def row_bounded(self):
+    def is_row_bounded(self):
        """
        Whether the window is bounded by row offsets.
 
@@ -208,6 +236,26 @@ def row_bounded(self):
        """
        return isinstance(self.bounds, RowsWindowBounds)
 
+    @property
+    def is_range_bounded(self):
+        """
+        Whether the window is bounded by range offsets.
+
+        This is relevant for determining whether the window requires a total order
+        to calculate deterministically.
+        """
+        return isinstance(self.bounds, RangeWindowBounds)
+
+    @property
+    def is_unbounded(self):
+        """
+        Whether the window is unbounded.
+
+        This is relevant for determining whether the window requires a total order
+        to calculate deterministically.
+ """ + return self.bounds is None + @property def all_referenced_columns(self) -> Set[ids.ColumnId]: """ @@ -220,7 +268,7 @@ def all_referenced_columns(self) -> Set[ids.ColumnId]: def without_order(self) -> WindowSpec: """Removes ordering clause if ordering isn't required to define bounds.""" - if self.row_bounded: + if self.is_row_bounded: raise ValueError("Cannot remove order from row-bounded window") return replace(self, ordering=()) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7f9e62b7dd..95ea487786 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -67,12 +67,12 @@ import bigframes.core.utils as utils import bigframes.core.validations as validations import bigframes.core.window +from bigframes.core.window import rolling import bigframes.core.window_spec as windows import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter import bigframes.operations as ops -import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops import bigframes.operations.ai import bigframes.operations.plotting as plotting @@ -1634,19 +1634,62 @@ def to_pandas( ) -> pandas.DataFrame | pandas.Series: """Write DataFrame to pandas DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col': [4, 2, 2]}) + + Download the data from BigQuery and convert it into an in-memory pandas DataFrame. + + >>> df.to_pandas() + col + 0 4 + 1 2 + 2 2 + + Estimate job statistics without processing or downloading data by using `dry_run=True`. + + >>> df.to_pandas(dry_run=True) # doctest: +SKIP + columnCount 1 + columnDtypes {'col': Int64} + indexLevel 1 + indexDtypes [Int64] + projectId bigframes-dev + location US + jobType QUERY + destinationTable {'projectId': 'bigframes-dev', 'datasetId': '_... + useLegacySql False + referencedTables None + totalBytesProcessed 0 + cacheHit False + statementType SELECT + creationTime 2025-04-02 20:17:12.038000+00:00 + dtype: object + Args: max_download_size (int, default None): - Download size threshold in MB. If max_download_size is exceeded when downloading data - (e.g., to_pandas()), the data will be downsampled if - bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be - raised. If set to a value other than None, this will supersede the global config. + .. deprecated:: 2.0.0 + ``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()`` + method instead. + + Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data, + the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is + ``True``, otherwise, an error will be raised. If set to a value other than ``None``, + this will supersede the global config. sampling_method (str, default None): + .. deprecated:: 2.0.0 + ``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead. + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm returns a portion of the data from the beginning. It is fast and requires minimal computations to perform the downsampling; "uniform": This algorithm returns uniform random samples of the data. If set to a value other than None, this will supersede the global config. random_state (int, default None): + .. deprecated:: 2.0.0 + ``random_state`` parameter is deprecated. Please use ``sample()`` method instead. + The seed for the uniform downsampling algorithm. 
                 If provided, the uniform method may take longer to execute and require more
                 computation. If set to a value other than None, this will supersede the global
                 config.
@@ -1666,8 +1709,19 @@ def to_pandas(
                 downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
                 Series containing dry run statistics will be returned.
         """
-
-        # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job
+        if max_download_size is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `max_download_size` parameter for `DataFrame.to_pandas()` "
+                "is deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
+            )
+            warnings.warn(msg, category=FutureWarning)
+        if sampling_method is not None or random_state is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `sampling_method` and `random_state` parameters for "
+                "`DataFrame.to_pandas()` are deprecated and will be removed soon. "
+                "Please use `DataFrame.sample().to_pandas()` instead for sampling."
+            )
+            warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
@@ -1702,11 +1756,40 @@ def to_pandas_batches(
         page_size and max_results determine the size and number of batches,
         see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]})
+
+        Iterate through the results in batches, limiting the total rows yielded
+        across all batches via `max_results`:
+
+        >>> for df_batch in df.to_pandas_batches(max_results=3):
+        ...     print(df_batch)
+           col
+        0    4
+        1    3
+        2    2
+
+        Alternatively, control the approximate size of each batch using `page_size`
+        and fetch batches manually using `next()`:
+
+        >>> it = df.to_pandas_batches(page_size=2)
+        >>> next(it)
+           col
+        0    4
+        1    3
+        >>> next(it)
+           col
+        2    2
+        3    2
+
         Args:
             page_size (int, default None):
-                The size of each batch.
+                The maximum number of rows of each batch. Non-positive values are ignored.
             max_results (int, default None):
-                If given, only download this many rows at maximum.
+                The maximum total number of rows of all batches.
             allow_large_results (bool, default None):
                 If not None, overrides the global setting to allow or disallow large query results
                 over the default size limit of 10 GB.
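Given the guidance in the warnings above, a migration sketch for code that relied on the deprecated arguments (the fraction, seed, and page size are illustrative):

    # Before (deprecated in 2.0): sampling and size caps via to_pandas() arguments.
    # pdf = df.to_pandas(sampling_method="uniform", random_state=42)

    # After: sample explicitly, then download.
    pdf = df.sample(frac=0.1, random_state=42).to_pandas()

    # And stream large results in batches instead of relying on max_download_size.
    for batch in df.to_pandas_batches(page_size=10_000):
        ...  # each batch is an in-memory pandas DataFrame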
@@ -3310,16 +3393,33 @@ def _perform_join_by_index( @validations.requires_ordering() def rolling( self, - window: int, + window: int | pandas.Timedelta | numpy.timedelta64 | datetime.timedelta | str, min_periods=None, + on: str | None = None, closed: Literal["right", "left", "both", "neither"] = "right", ) -> bigframes.core.window.Window: - window_def = windows.WindowSpec( - bounds=windows.RowsWindowBounds.from_window_size(window, closed), - min_periods=min_periods if min_periods is not None else window, - ) - return bigframes.core.window.Window( - self._block, window_def, self._block.value_columns + if isinstance(window, int): + window_def = windows.WindowSpec( + bounds=windows.RowsWindowBounds.from_window_size(window, closed), + min_periods=min_periods if min_periods is not None else window, + ) + skip_agg_col_id = ( + None if on is None else self._block.resolve_label_exact_or_error(on) + ) + return bigframes.core.window.Window( + self._block, + window_def, + self._block.value_columns, + skip_agg_column_id=skip_agg_col_id, + ) + + return rolling.create_range_window( + self._block, + window, + min_periods=min_periods, + on=on, + closed=closed, + is_series=False, ) @validations.requires_ordering() @@ -3483,7 +3583,7 @@ def pct_change(self, periods: int = 1) -> DataFrame: def _apply_window_op( self, - op: agg_ops.WindowOp, + op: agg_ops.UnaryWindowOp, window_spec: windows.WindowSpec, ): block, result_ids = self._block.multi_apply_window_op( @@ -3768,7 +3868,7 @@ def to_gbq( # The client code owns this table reference now temp_table_ref = ( - self._session._temp_storage_manager.generate_unique_resource_id() + self._session._anon_dataset_manager.generate_unique_resource_id() ) destination_table = f"{temp_table_ref.project}.{temp_table_ref.dataset_id}.{temp_table_ref.table_id}" diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 22cc521e8e..47b128dae6 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -28,7 +28,7 @@ import numpy as np import pandas as pd import pyarrow as pa -import shapely # type: ignore +import shapely.geometry # type: ignore # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -352,6 +352,24 @@ def is_comparable(type_: ExpressionType) -> bool: return (type_ is not None) and is_orderable(type_) +def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]: + assert isinstance(type_, pd.ArrowDtype) + assert isinstance(type_.pyarrow_dtype, pa.StructType) + struct_type = type_.pyarrow_dtype + result: dict[str, Dtype] = {} + for field_no in range(struct_type.num_fields): + field = struct_type.field(field_no) + result[field.name] = arrow_dtype_to_bigframes_dtype(field.type) + return result + + +def get_array_inner_type(type_: ExpressionType) -> Dtype: + assert isinstance(type_, pd.ArrowDtype) + assert isinstance(type_.pyarrow_dtype, pa.ListType) + list_type = type_.pyarrow_dtype + return arrow_dtype_to_bigframes_dtype(list_type.value_type) + + _ORDERABLE_SIMPLE_TYPES = set( mapping.dtype for mapping in SIMPLE_TYPES if mapping.orderable ) @@ -456,6 +474,8 @@ def bigframes_dtype_to_arrow_dtype( if bigframes_dtype in _BIGFRAMES_TO_ARROW: return _BIGFRAMES_TO_ARROW[bigframes_dtype] if isinstance(bigframes_dtype, pd.ArrowDtype): + if pa.types.is_duration(bigframes_dtype.pyarrow_dtype): + return bigframes_dtype.pyarrow_dtype if pa.types.is_list(bigframes_dtype.pyarrow_dtype): return bigframes_dtype.pyarrow_dtype if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): @@ -486,7 +506,7 @@ def bigframes_dtype_to_literal( if 
isinstance(bigframes_dtype, pd.StringDtype):
         return "string"
     if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
-        return shapely.Point((0, 0))
+        return shapely.geometry.Point((0, 0))
 
     raise TypeError(
         f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
@@ -697,9 +717,10 @@ def convert_schema_field(
 
 
 def convert_to_schema_field(
-    name: str,
-    bigframes_dtype: Dtype,
+    name: str, bigframes_dtype: Dtype, overrides: dict[Dtype, str] = {}
 ) -> google.cloud.bigquery.SchemaField:
+    if bigframes_dtype in overrides:
+        return google.cloud.bigquery.SchemaField(name, overrides[bigframes_dtype])
     if bigframes_dtype in _BIGFRAMES_TO_TK:
         return google.cloud.bigquery.SchemaField(
             name, _BIGFRAMES_TO_TK[bigframes_dtype]
@@ -709,7 +730,7 @@ def convert_to_schema_field(
         inner_type = arrow_dtype_to_bigframes_dtype(
             bigframes_dtype.pyarrow_dtype.value_type
         )
-        inner_field = convert_to_schema_field(name, inner_type)
+        inner_field = convert_to_schema_field(name, inner_type, overrides)
         return google.cloud.bigquery.SchemaField(
             name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields
         )
@@ -719,7 +740,9 @@ def convert_to_schema_field(
         for i in range(struct_type.num_fields):
             field = struct_type.field(i)
             inner_bf_type = arrow_dtype_to_bigframes_dtype(field.type)
-            inner_fields.append(convert_to_schema_field(field.name, inner_bf_type))
+            inner_fields.append(
+                convert_to_schema_field(field.name, inner_bf_type, overrides)
+            )
 
     return google.cloud.bigquery.SchemaField(
         name, "RECORD", fields=inner_fields
diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py
index 44aea57898..8a591f6916 100644
--- a/bigframes/functions/_function_client.py
+++ b/bigframes/functions/_function_client.py
@@ -196,6 +196,7 @@ def provision_bq_managed_function(
         name,
         packages,
         is_row_processor,
+        bq_connection_id,
         *,
         capture_references=False,
     ):
@@ -273,12 +274,21 @@ def provision_bq_managed_function(
         udf_code = textwrap.dedent(inspect.getsource(func))
         udf_code = udf_code[udf_code.index("def") :]
 
+        with_connection_clause = (
+            (
+                f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{bq_connection_id}`"
+            )
+            if bq_connection_id
+            else ""
+        )
+
         create_function_ddl = (
             textwrap.dedent(
                 f"""
                 CREATE OR REPLACE FUNCTION {persistent_func_id}({','.join(bq_function_args)})
                 RETURNS {bq_function_return_type}
                 LANGUAGE python
+                {with_connection_clause}
                 OPTIONS ({managed_function_options_str})
                 AS r'''
                 __UDF_PLACE_HOLDER__
@@ -365,7 +375,7 @@ def create_cloud_function(
         is_row_processor=False,
         vpc_connector=None,
         memory_mib=1024,
-        ingress_settings="all",
+        ingress_settings="internal-only",
     ):
         """Create a cloud function from the given user defined function."""
 
diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py
index c04de54be6..ec0e977782 100644
--- a/bigframes/functions/_function_session.py
+++ b/bigframes/functions/_function_session.py
@@ -167,7 +167,7 @@ def _resolve_bigquery_connection_id(
         if not bigquery_connection:
             bigquery_connection = session._bq_connection  # type: ignore
 
-        bigquery_connection = clients.resolve_full_bq_connection_name(
+        bigquery_connection = clients.get_canonical_bq_connection_id(
             bigquery_connection,
             default_project=dataset_ref.project,
             default_location=bq_location,
@@ -237,6 +237,7 @@ def _try_delattr(self, func: Callable, attr: str) -> None:
     # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py
     def remote_function(
         self,
+        *,
         input_types: Union[None, type, Sequence[type]] = None,
         output_type: Optional[type] = None,
         session: Optional[Session] = None,
@@ -251,7 +252,7 @@ def remote_function(
         reuse: bool = True,
         name: Optional[str] = None,
         packages: Optional[Sequence[str]] = None,
-        cloud_function_service_account: Optional[str] = None,
+        cloud_function_service_account: str,
         cloud_function_kms_key_name: Optional[str] = None,
         cloud_function_docker_repository: Optional[str] = None,
         max_batching_rows: Optional[int] = 1000,
@@ -259,9 +260,9 @@ def remote_function(
         cloud_function_max_instances: Optional[int] = None,
         cloud_function_vpc_connector: Optional[str] = None,
         cloud_function_memory_mib: Optional[int] = 1024,
-        cloud_function_ingress_settings: Optional[
-            Literal["all", "internal-only", "internal-and-gclb"]
-        ] = None,
+        cloud_function_ingress_settings: Literal[
+            "all", "internal-only", "internal-and-gclb"
+        ] = "internal-only",
     ):
         """Decorator to turn a user defined function into a BigQuery remote function.
 
@@ -384,8 +385,8 @@ def remote_function(
                 Explicit name of the external package dependencies. Each dependency
                 is added to the `requirements.txt` as is, and can be of the form
                 supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/.
-            cloud_function_service_account (str, Optional):
-                Service account to use for the cloud functions. If not provided then
+            cloud_function_service_account (str):
+                Service account to use for the cloud functions. If "default" is provided then
                 the default service account would be used. See
                 https://cloud.google.com/functions/docs/securing/function-identity
                 for more details. Please make sure the service account has the
@@ -448,29 +449,20 @@ def remote_function(
                 https://cloud.google.com/functions/docs/configuring/memory.
             cloud_function_ingress_settings (str, Optional):
                 Ingress settings controls dictating what traffic can reach the
-                function. By default `all` will be used. It must be one of:
-                `all`, `internal-only`, `internal-and-gclb`. See for more details
+                function. Options are: `all`, `internal-only`, or `internal-and-gclb`.
+                If no setting is provided, `internal-only` will be used by default.
+                See for more details
                 https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings.
         """
         # Some defaults may be used from the session if not provided otherwise.
         session = self._resolve_session(session)
 
-        # raise a UserWarning if user does not explicitly set cloud_function_service_account to a
-        # user-managed cloud_function_service_account of to default
-        msg = bfe.format_message(
-            "You have not explicitly set a user-managed `cloud_function_service_account`. "
-            "Using the default Compute Engine service account. "
-            "In BigFrames 2.0 onwards, you would have to explicitly set `cloud_function_service_account` "
-            'either to a user-managed service account (preferred) or to `"default"` '
-            "to use the default Compute Engine service account (discouraged). "
-            "See, https://cloud.google.com/functions/docs/securing/function-identity."
-        )
-
+        # If the user forces the cloud function service account argument to
+        # None, throw an exception
        if cloud_function_service_account is None:
-            warnings.warn(msg, stacklevel=2, category=FutureWarning)
-
-        if cloud_function_service_account == "default":
-            cloud_function_service_account = None
+            raise ValueError(
+                'You must provide a user-managed cloud_function_service_account, or "default" if you would like to let the default service account be used.'
+            )
 
         # A BigQuery client is required to perform BQ operations.
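Under the new signature, a call now looks roughly like this sketch (the service-account value is a placeholder; passing the literal "default" opts into the default compute service account):

    # Illustrative only: parameters are keyword-only in 2.0, and
    # cloud_function_service_account must be supplied explicitly.
    import bigframes.pandas as bpd

    @bpd.remote_function(
        input_types=[int],
        output_type=int,
        cloud_function_service_account="default",  # or a user-managed SA email
        # cloud_function_ingress_settings now defaults to "internal-only".
    )
    def add_one(x: int) -> int:
        return x + 1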
bigquery_client = self._resolve_bigquery_client(session, bigquery_client) @@ -516,24 +508,11 @@ def remote_function( ) if cloud_function_ingress_settings is None: - cloud_function_ingress_settings = "all" + cloud_function_ingress_settings = "internal-only" msg = bfe.format_message( - "The `cloud_function_ingress_settings` are set to 'all' by default, " - "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " - "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " - "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." + "The `cloud_function_ingress_settings` is being set to 'internal-only' by default." ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - if cloud_function_ingress_settings is None: - cloud_function_ingress_settings = "all" - msg = bfe.format_message( - "The `cloud_function_ingress_settings` are set to 'all' by default, " - "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " - "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " - "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) + warnings.warn(msg, category=UserWarning, stacklevel=2) bq_connection_manager = session.bqconnectionmanager @@ -615,7 +594,9 @@ def wrapper(func): bq_connection_manager, cloud_function_region, cloud_functions_client, - cloud_function_service_account, + None + if cloud_function_service_account == "default" + else cloud_function_service_account, cloud_function_kms_key_name, cloud_function_docker_repository, session=session, # type: ignore @@ -794,6 +775,13 @@ def udf( Name of the BigQuery connection. It is used to provide an identity to the serverless instances running the user code. It helps BigQuery manage and track the resources used by the udf. + This connection is required for internet access and for + interacting with other GCP services. To access GCP services, the + appropriate IAM permissions must also be granted to the + connection's Service Account. When it defaults to None, the udf + will be created without any connection. A udf without a + connection has no internet access and no access to other GCP + services. name (str, Optional): Explicit name of the persisted BigQuery managed function. Use it with caution, because more than one users working in the same @@ -805,7 +793,7 @@ def udf( ``bigframes.pandas.reset_session``/ ``bigframes.pandas.clean_up_by_session_id``) does not clean up the function, and leaves it for the user to manage the function - and the associated cloud function directly. + directly. packages (str[], Optional): Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be @@ -826,9 +814,13 @@ def udf( bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) - # A connection is required for BQ managed function. - bq_connection_id = self._resolve_bigquery_connection_id( - session, dataset_ref, bq_location, bigquery_connection + # A connection is optional for BQ managed function. 
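To make the optional-connection behavior concrete, a sketch of both udf flavors (the dataset, function names, and connection id are placeholders; `dataset` and `name` are mandatory per this release's breaking changes):

    # Illustrative only: without a connection, the managed function has no
    # internet access and no access to other GCP services.
    import bigframes.pandas as bpd

    @bpd.udf(dataset="my_dataset", name="add_one")
    def add_one(x: int) -> int:
        return x + 1

    # With a connection, the generated DDL gains a
    # WITH CONNECTION `<project>.<location>.<connection_id>` clause.
    @bpd.udf(dataset="my_dataset", name="strlen", bigquery_connection="my-conn")
    def strlen(s: str) -> int:
        return len(s)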
+        bq_connection_id = (
+            self._resolve_bigquery_connection_id(
+                session, dataset_ref, bq_location, bigquery_connection
+            )
+            if bigquery_connection
+            else None
         )
 
         bq_connection_manager = session.bqconnectionmanager
@@ -926,6 +918,7 @@ def wrapper(func):
                 name=name,
                 packages=packages,
                 is_row_processor=is_row_processor,
+                bq_connection_id=bq_connection_id,
             )
 
             # TODO(shobs): Find a better way to support udfs with param named
diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py
index 30b3d23056..858c25fada 100644
--- a/bigframes/functions/function.py
+++ b/bigframes/functions/function.py
@@ -27,9 +27,7 @@
     from bigframes.session import Session
 
 import google.api_core.exceptions
-import google.api_core.retry
 from google.cloud import bigquery
-import google.iam.v1
 
 import bigframes.core.compile.ibis_types
 import bigframes.dtypes
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 01917fd6d8..81637333b0 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -117,6 +117,12 @@ def model(self) -> bigquery.Model:
         """Get the BQML model associated with this wrapper"""
         return self._model
 
+    def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
+        return self._apply_ml_tvf(
+            input_data,
+            self._model_manipulation_sql_generator.ml_recommend,
+        )
+
     def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
         return self._apply_ml_tvf(
             input_data,
diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py
index c98e18322a..ece950a5a2 100644
--- a/bigframes/ml/decomposition.py
+++ b/bigframes/ml/decomposition.py
@@ -19,6 +19,7 @@
 
 from typing import List, Literal, Optional, Union
 
+import bigframes_vendored.sklearn.decomposition._mf
 import bigframes_vendored.sklearn.decomposition._pca
 from google.cloud import bigquery
 
@@ -27,7 +28,15 @@
 import bigframes.pandas as bpd
 import bigframes.session
 
-_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"}
+_BQML_PARAMS_MAPPING = {
+    "svd_solver": "pcaSolver",
+    "feedback_type": "feedbackType",
+    "num_factors": "numFactors",
+    "user_col": "userColumn",
+    "item_col": "itemColumn",
+    "_input_label_columns": "inputLabelColumns",
+    "l2_reg": "l2Regularization",
+}
 
 
 @log_adapter.class_logger
@@ -197,3 +206,159 @@ def score(
         # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
         return self._bqml_model.evaluate()
+
+
+@log_adapter.class_logger
+class MatrixFactorization(
+    base.UnsupervisedTrainablePredictor,
+    bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization,
+):
+    __doc__ = bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization.__doc__
+
+    def __init__(
+        self,
+        *,
+        feedback_type: Literal["explicit", "implicit"] = "explicit",
+        num_factors: int,
+        user_col: str,
+        item_col: str,
+        rating_col: str = "rating",
+        # TODO: Add support for hyperparameter tuning.
+        l2_reg: float = 1.0,
+    ):
+
+        feedback_type = feedback_type.lower()  # type: ignore
+        if feedback_type not in ("explicit", "implicit"):
+            raise ValueError("Expected feedback_type to be `explicit` or `implicit`.")
+
+        self.feedback_type = feedback_type
+
+        if not isinstance(num_factors, int):
+            raise TypeError(
+                f"Expected num_factors to be an int, but got {type(num_factors)}."
+            )
+
+        if num_factors <= 0:
+            raise ValueError(
+                f"Expected num_factors to be a positive integer, but got {num_factors}."
+            )
+
+        self.num_factors = num_factors
+
+        if not isinstance(user_col, str):
+            raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.")
+
+        self.user_col = user_col
+
+        if not isinstance(item_col, str):
+            raise TypeError(f"Expected item_col to be a str, but got {type(item_col)}.")
+
+        self.item_col = item_col
+
+        if not isinstance(rating_col, str):
+            raise TypeError(
+                f"Expected rating_col to be a str, but got {type(rating_col)}."
+            )
+
+        self._input_label_columns = [rating_col]
+
+        if not isinstance(l2_reg, (float, int)):
+            raise TypeError(
+                f"Expected l2_reg to be a float or int, but got {type(l2_reg)}."
+            )
+
+        self.l2_reg = l2_reg
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+
+    @property
+    def rating_col(self) -> str:
+        """str: The rating column name. Defaults to 'rating'."""
+        return self._input_label_columns[0]
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.session.Session, bq_model: bigquery.Model
+    ) -> MatrixFactorization:
+        assert bq_model.model_type == "MATRIX_FACTORIZATION"
+
+        kwargs = utils.retrieve_params_from_bq_model(
+            cls, bq_model, _BQML_PARAMS_MAPPING
+        )
+
+        model = cls(**kwargs)
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    @property
+    def _bqml_options(self) -> dict:
+        """The model options as they will be set for BQML"""
+        options: dict = {
+            "model_type": "matrix_factorization",
+            "feedback_type": self.feedback_type,
+            "user_col": self.user_col,
+            "item_col": self.item_col,
+            "rating_col": self.rating_col,
+            "l2_reg": self.l2_reg,
+        }
+
+        if self.num_factors is not None:
+            options["num_factors"] = self.num_factors
+
+        return options
+
+    def _fit(
+        self,
+        X: utils.ArrayType,
+        y=None,
+        transforms: Optional[List[str]] = None,
+    ) -> MatrixFactorization:
+        if y is not None:
+            raise ValueError(
+                "Label column not supported for Matrix Factorization model but y was not `None`"
+            )
+
+        (X,) = utils.batch_convert_to_dataframe(X)
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X_train=X,
+            transforms=transforms,
+            options=self._bqml_options,
+        )
+        return self
+
+    def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before recommend")
+
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.recommend(X)
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            MatrixFactorization: Saved model."""
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before it can be saved")
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
+
+    def score(
+        self,
+        X=None,
+        y=None,
+    ) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
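Putting the new estimator together, a usage sketch (the table and column names are placeholders; `fit()` is the public entry point supplied by the estimator base classes and delegates to `_fit`):

    # Illustrative only: train and apply a BQML MATRIX_FACTORIZATION model.
    import bigframes.pandas as bpd
    from bigframes.ml.decomposition import MatrixFactorization

    ratings = bpd.read_gbq("my_project.my_dataset.ratings")

    model = MatrixFactorization(
        feedback_type="explicit",
        num_factors=16,
        user_col="user_id",
        item_col="item_id",
        rating_col="rating",
    )
    model.fit(ratings)              # CREATE MODEL under the hood
    preds = model.predict(ratings)  # runs ML.RECOMMEND via BqmlModel.recommend
    model.to_gbq("my_project.my_dataset.mf_model", replace=True)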
+ return self._bqml_model.evaluate() diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 1fd9fbc4a7..cce05ea1f2 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -21,9 +21,8 @@ import bigframes_vendored.constants as constants from google.cloud import bigquery -import typing_extensions -from bigframes import clients, dtypes, exceptions +from bigframes import dtypes, exceptions import bigframes.bigquery as bbq from bigframes.core import blocks, global_session, log_adapter import bigframes.dataframe @@ -34,20 +33,6 @@ "max_iterations": "maxIterations", } -_TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" -_TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" -_TEXT_GENERATOR_ENDPOINTS = ( - _TEXT_GENERATOR_BISON_ENDPOINT, - _TEXT_GENERATOR_BISON_32K_ENDPOINT, -) - -_EMBEDDING_GENERATOR_GECKO_ENDPOINT = "textembedding-gecko" -_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT = "textembedding-gecko-multilingual" -_PALM2_EMBEDDING_GENERATOR_ENDPOINTS = ( - _EMBEDDING_GENERATOR_GECKO_ENDPOINT, - _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, -) - _TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005" _TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004" _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002" @@ -59,7 +44,6 @@ _MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" -_GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" _GEMINI_1P5_PRO_001_ENDPOINT = "gemini-1.5-pro-001" @@ -67,8 +51,9 @@ _GEMINI_1P5_FLASH_001_ENDPOINT = "gemini-1.5-flash-001" _GEMINI_1P5_FLASH_002_ENDPOINT = "gemini-1.5-flash-002" _GEMINI_2_FLASH_EXP_ENDPOINT = "gemini-2.0-flash-exp" +_GEMINI_2_FLASH_001_ENDPOINT = "gemini-2.0-flash-001" +_GEMINI_2_FLASH_LITE_001_ENDPOINT = "gemini-2.0-flash-lite-001" _GEMINI_ENDPOINTS = ( - _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT, @@ -76,6 +61,8 @@ _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, _GEMINI_2_FLASH_EXP_ENDPOINT, + _GEMINI_2_FLASH_001_ENDPOINT, + _GEMINI_2_FLASH_LITE_001_ENDPOINT, ) _GEMINI_PREVIEW_ENDPOINTS = ( _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, @@ -83,7 +70,6 @@ _GEMINI_2_FLASH_EXP_ENDPOINT, ) _GEMINI_FINE_TUNE_SCORE_ENDPOINTS = ( - _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, ) @@ -108,7 +94,6 @@ _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" -_ML_EMBED_TEXT_STATUS = "ml_embed_text_status" _ML_GENERATE_EMBEDDING_STATUS = "ml_generate_embedding_status" _MODEL_NOT_SUPPORTED_WARNING = ( @@ -118,514 +103,7 @@ "You should use this model name only if you are sure that it is supported in BigQuery." ) - -@typing_extensions.deprecated( - "PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", - category=exceptions.ApiDeprecationWarning, -) -@log_adapter.class_logger -class PaLM2TextGenerator(base.BaseEstimator): - """PaLM2 text generator LLM model. - - .. note:: - PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. - - Args: - model_name (str, Default to "text-bison"): - The model for natural language tasks. 
“text-bison” returns model fine-tuned to follow natural language instructions - and is suitable for a variety of language tasks. "text-bison-32k" supports up to 32k tokens per request. - Default to "text-bison". - session (bigframes.Session or None): - BQ session to create the model. If None, use the global default session. - connection_name (str or None): - Connection to connect with remote service. str of the format ... - If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach - permission if the connection isn't fully set up. - max_iterations (Optional[int], Default to 300): - The number of steps to run when performing supervised tuning. - """ - - def __init__( - self, - *, - model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", - session: Optional[bigframes.Session] = None, - connection_name: Optional[str] = None, - max_iterations: int = 300, - ): - self.model_name = model_name - self.session = session or global_session.get_global_session() - self.max_iterations = max_iterations - self._bq_connection_manager = self.session.bqconnectionmanager - - connection_name = connection_name or self.session._bq_connection - self.connection_name = clients.resolve_full_bq_connection_name( - connection_name, - default_project=self.session._project, - default_location=self.session._location, - ) - - self._bqml_model_factory = globals.bqml_model_factory() - self._bqml_model: core.BqmlModel = self._create_bqml_model() - - def _create_bqml_model(self): - # Parse and create connection if needed. - if not self.connection_name: - raise ValueError( - "Must provide connection_name, either in constructor or through session options." - ) - - if self._bq_connection_manager: - connection_name_parts = self.connection_name.split(".") - if len(connection_name_parts) != 3: - raise ValueError( - f"connection_name must be of the format .., got {self.connection_name}." 
- ) - self._bq_connection_manager.create_bq_connection( - project_id=connection_name_parts[0], - location=connection_name_parts[1], - connection_id=connection_name_parts[2], - iam_role="aiplatform.user", - ) - - if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: - msg = exceptions.format_message( - _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_TEXT_GENERATOR_ENDPOINTS), - ) - ) - warnings.warn(msg) - - options = { - "endpoint": self.model_name, - } - - return self._bqml_model_factory.create_remote_model( - session=self.session, connection_name=self.connection_name, options=options - ) - - @classmethod - def _from_bq( - cls, session: bigframes.Session, bq_model: bigquery.Model - ) -> PaLM2TextGenerator: - assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in bq_model._properties - assert "endpoint" in bq_model._properties["remoteModelInfo"] - assert "connection" in bq_model._properties["remoteModelInfo"] - - # Parse the remote model endpoint - bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] - model_connection = bq_model._properties["remoteModelInfo"]["connection"] - model_endpoint = bqml_endpoint.split("/")[-1] - - kwargs = utils.retrieve_params_from_bq_model( - cls, bq_model, _BQML_PARAMS_MAPPING - ) - - model = cls( - **kwargs, - session=session, - model_name=model_endpoint, - connection_name=model_connection, - ) - model._bqml_model = core.BqmlModel(session, bq_model) - return model - - @property - def _bqml_options(self) -> dict: - """The model options as they will be set for BQML""" - options = { - "max_iterations": self.max_iterations, - "data_split_method": "NO_SPLIT", - } - return options - - def fit( - self, - X: utils.ArrayType, - y: utils.ArrayType, - ) -> PaLM2TextGenerator: - """Fine tune PaLM2TextGenerator model. - - .. note:: - - This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the - Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" - and might have limited support. For more information, see the launch stage descriptions - (https://cloud.google.com/products#product-launch-stages). - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Training labels. - - Returns: - PaLM2TextGenerator: Fitted estimator. - """ - X, y = utils.batch_convert_to_dataframe(X, y) - - options = self._bqml_options - options["endpoint"] = self.model_name + "@001" - options["prompt_col"] = X.columns.tolist()[0] - - self._bqml_model = self._bqml_model_factory.create_llm_remote_model( - X, - y, - options=options, - connection_name=self.connection_name, - ) - return self - - def predict( - self, - X: utils.ArrayType, - *, - temperature: float = 0.0, - max_output_tokens: int = 128, - top_k: int = 40, - top_p: float = 0.95, - ) -> bigframes.dataframe.DataFrame: - """Predict the result from input DataFrame. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. 
- Prompts can include preamble, questions, suggestions, instructions, or examples. - - temperature (float, default 0.0): - The temperature is used for sampling during the response generation, which occurs when topP and topK are applied. - Temperature controls the degree of randomness in token selection. Lower temperatures are good for prompts that expect a true or correct response, - while higher temperatures can lead to more diverse or unexpected results. A temperature of 0 is deterministic: - the highest probability token is always selected. For most use cases, try starting with a temperature of 0.2. - Default 0. Possible values [0.0, 1.0]. - - max_output_tokens (int, default 128): - Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. - A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. - Default 128. For the 'text-bison' model, possible values are in the range [1, 1024]. For the 'text-bison-32k' model, possible values are in the range [1, 8192]. - Please ensure that the specified value for max_output_tokens is within the appropriate range for the model being used. - - top_k (int, default 40): - Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens - in the model's vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature). - For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling. - Specify a lower value for less random responses and a higher value for more random responses. - Default 40. Possible values [1, 40]. - - top_p (float, default 0.95):: - Top-p changes how the model selects tokens for output. Tokens are selected from most K (see topK parameter) probable to least until the sum of their probabilities equals the top-p value. - For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-p value is 0.5, then the model will select either A or B as the next token (using temperature) - and not consider C at all. - Specify a lower value for less random responses and a higher value for more random responses. - Default 0.95. Possible values [0.0, 1.0]. - - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. - """ - - # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - if temperature < 0.0 or temperature > 1.0: - raise ValueError(f"temperature must be [0.0, 1.0], but is {temperature}.") - - if ( - self.model_name == _TEXT_GENERATOR_BISON_ENDPOINT - and max_output_tokens not in range(1, 1025) - ): - raise ValueError( - f"max_output_token must be [1, 1024] for TextBison model, but is {max_output_tokens}." - ) - - if ( - self.model_name == _TEXT_GENERATOR_BISON_32K_ENDPOINT - and max_output_tokens not in range(1, 8193) - ): - raise ValueError( - f"max_output_token must be [1, 8192] for TextBison 32k model, but is {max_output_tokens}." 
- ) - - if top_k not in range(1, 41): - raise ValueError(f"top_k must be [1, 40], but is {top_k}.") - - if top_p < 0.0 or top_p > 1.0: - raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - if len(X.columns) == 1: - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "prompt"}) - - options = { - "temperature": temperature, - "max_output_tokens": max_output_tokens, - "top_k": top_k, - "top_p": top_p, - "flatten_json_output": True, - } - - df = self._bqml_model.generate_text(X, options) - - if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): - msg = exceptions.format_message( - f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for " - "detailed status. You may want to filter the failed rows and retry." - ) - warnings.warn(msg, category=RuntimeWarning) - - return df - - def score( - self, - X: utils.ArrayType, - y: utils.ArrayType, - task_type: Literal[ - "text_generation", "classification", "summarization", "question_answering" - ] = "text_generation", - ) -> bigframes.dataframe.DataFrame: - """Calculate evaluation metrics of the model. - - .. note:: - - This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the - Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" - and might have limited support. For more information, see the launch stage descriptions - (https://cloud.google.com/products#product-launch-stages). - - .. note:: - - Output matches that of the BigQuery ML.EVALUATE function. - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm - for the outputs relevant to this model type. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - A BigQuery DataFrame as evaluation data, which contains only one column of input_text - that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - A BigQuery DataFrame as evaluation labels, which contains only one column of output_text - that you would expect to be returned by the model. - task_type (str): - The type of the task for LLM model. Default to "text_generation". - Possible values: "text_generation", "classification", "summarization", and "question_answering". - - Returns: - bigframes.dataframe.DataFrame: The DataFrame as evaluation result. - """ - if not self._bqml_model: - raise RuntimeError("A model must be fitted before score") - - X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) - - if len(X.columns) != 1 or len(y.columns) != 1: - raise ValueError( - f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - X_col_label = cast(blocks.Label, X.columns[0]) - y_col_label = cast(blocks.Label, y.columns[0]) - X = X.rename(columns={X_col_label: "input_text"}) - y = y.rename(columns={y_col_label: "output_text"}) - - input_data = X.join(y, how="outer") - - return self._bqml_model.llm_evaluate(input_data, task_type) - - def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: - """Save the model to BigQuery. - - Args: - model_name (str): - The name of the model. 
- replace (bool, default False): - Determine whether to replace if the model already exists. Default to False. - - Returns: - PaLM2TextGenerator: Saved model.""" - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) - - -@typing_extensions.deprecated( - "PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", - category=exceptions.ApiDeprecationWarning, -) -@log_adapter.class_logger -class PaLM2TextEmbeddingGenerator(base.BaseEstimator): - """PaLM2 text embedding generator LLM model. - - .. note:: - PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. - - - Args: - model_name (str, Default to "textembedding-gecko"): - The model for text embedding. “textembedding-gecko” returns model embeddings for text inputs. - "textembedding-gecko-multilingual" returns model embeddings for text inputs which support over 100 languages. - Default to "textembedding-gecko". - version (str or None): - Model version. Accepted values are "001", "002", "003", "latest" etc. Will use the default version if unset. - See https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning for details. - session (bigframes.Session or None): - BQ session to create the model. If None, use the global default session. - connection_name (str or None): - Connection to connect with remote service. str of the format ... - If None, use default connection in session context. - """ - - def __init__( - self, - *, - model_name: Literal[ - "textembedding-gecko", "textembedding-gecko-multilingual" - ] = "textembedding-gecko", - version: Optional[str] = None, - session: Optional[bigframes.Session] = None, - connection_name: Optional[str] = None, - ): - self.model_name = model_name - self.version = version - self.session = session or global_session.get_global_session() - self._bq_connection_manager = self.session.bqconnectionmanager - - connection_name = connection_name or self.session._bq_connection - self.connection_name = clients.resolve_full_bq_connection_name( - connection_name, - default_project=self.session._project, - default_location=self.session._location, - ) - - self._bqml_model_factory = globals.bqml_model_factory() - self._bqml_model: core.BqmlModel = self._create_bqml_model() - - def _create_bqml_model(self): - # Parse and create connection if needed. - if not self.connection_name: - raise ValueError( - "Must provide connection_name, either in constructor or through session options." - ) - - if self._bq_connection_manager: - connection_name_parts = self.connection_name.split(".") - if len(connection_name_parts) != 3: - raise ValueError( - f"connection_name must be of the format .., got {self.connection_name}." 
- ) - self._bq_connection_manager.create_bq_connection( - project_id=connection_name_parts[0], - location=connection_name_parts[1], - connection_id=connection_name_parts[2], - iam_role="aiplatform.user", - ) - - if self.model_name not in _PALM2_EMBEDDING_GENERATOR_ENDPOINTS: - msg = exceptions.format_message( - _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS), - ) - ) - warnings.warn(msg) - - endpoint = ( - self.model_name + "@" + self.version if self.version else self.model_name - ) - options = { - "endpoint": endpoint, - } - return self._bqml_model_factory.create_remote_model( - session=self.session, connection_name=self.connection_name, options=options - ) - - @classmethod - def _from_bq( - cls, session: bigframes.Session, bq_model: bigquery.Model - ) -> PaLM2TextEmbeddingGenerator: - assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in bq_model._properties - assert "endpoint" in bq_model._properties["remoteModelInfo"] - assert "connection" in bq_model._properties["remoteModelInfo"] - - # Parse the remote model endpoint - bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] - model_connection = bq_model._properties["remoteModelInfo"]["connection"] - model_endpoint = bqml_endpoint.split("/")[-1] - - model_name, version = utils.parse_model_endpoint(model_endpoint) - - model = cls( - session=session, - # str to literals - model_name=model_name, # type: ignore - version=version, - connection_name=model_connection, - ) - - model._bqml_model = core.BqmlModel(session, bq_model) - return model - - def predict(self, X: utils.ArrayType) -> bigframes.dataframe.DataFrame: - """Predict the result from input DataFrame. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. - """ - - # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - if len(X.columns) == 1: - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "content"}) - - options = { - "flatten_json_output": True, - } - - df = self._bqml_model.generate_embedding(X, options) - df = df.rename( - columns={ - "ml_generate_embedding_result": "text_embedding", - "ml_generate_embedding_statistics": "statistics", - "ml_generate_embedding_status": _ML_EMBED_TEXT_STATUS, - } - ) - - if (df[_ML_EMBED_TEXT_STATUS] != "").any(): - msg = exceptions.format_message( - f"Some predictions failed. Check column {_ML_EMBED_TEXT_STATUS} for " - "detailed status. You may want to filter the failed rows and retry." - ) - warnings.warn(msg, category=RuntimeWarning) - - return df - - def to_gbq( - self, model_name: str, replace: bool = False - ) -> PaLM2TextEmbeddingGenerator: - """Save the model to BigQuery. - - Args: - model_name (str): - The name of the model. - replace (bool, default False): - Determine whether to replace if the model already exists. Default to False. 
- - Returns: - PaLM2TextEmbeddingGenerator: Saved model.""" - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) +_REMOVE_DEFAULT_MODEL_WARNING = "Since upgrading the default model can cause unintended breakages, the default model will be removed in BigFrames 3.0. Please supply an explicit model to avoid this message." @log_adapter.class_logger @@ -637,7 +115,8 @@ class TextEmbeddingGenerator(base.RetriableRemotePredictor): The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004" or "text-multilingual-embedding-002". text-embedding models returns model embeddings for text inputs. text-multilingual-embedding models returns model embeddings for text inputs which support over 100 languages. - Default to "text-embedding-004". + If no setting is provided, "text-embedding-004" will be used by + default and a warning will be issued. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. connection_name (str or None): @@ -648,14 +127,20 @@ class TextEmbeddingGenerator(base.RetriableRemotePredictor): def __init__( self, *, - model_name: Literal[ - "text-embedding-005", - "text-embedding-004", - "text-multilingual-embedding-002", - ] = "text-embedding-004", + model_name: Optional[ + Literal[ + "text-embedding-005", + "text-embedding-004", + "text-multilingual-embedding-002", + ] + ] = None, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): + if model_name is None: + model_name = "text-embedding-004" + msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) + warnings.warn(msg, category=FutureWarning, stacklevel=2) self.model_name = model_name self.session = session or global_session.get_global_session() self.connection_name = connection_name @@ -780,7 +265,8 @@ class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): Args: model_name (str, Default to "multimodalembedding@001"): The model for multimodal embedding. Can set to "multimodalembedding@001". Multimodal-embedding models returns model embeddings for text, image and video inputs. - Default to "multimodalembedding@001". + If no setting is provided, "multimodalembedding@001" will be used by + default and a warning will be issued. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. connection_name (str or None): @@ -791,12 +277,16 @@ class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): def __init__( self, *, - model_name: Literal["multimodalembedding@001"] = "multimodalembedding@001", + model_name: Optional[Literal["multimodalembedding@001"]] = None, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): if not bigframes.options.experiments.blob: raise NotImplementedError() + if model_name is None: + model_name = "multimodalembedding@001" + msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) + warnings.warn(msg, category=FutureWarning, stacklevel=2) self.model_name = model_name self.session = session or global_session.get_global_session() self.connection_name = connection_name @@ -918,23 +408,24 @@ def to_gbq( return new_model.session.read_gbq_model(model_name) -@typing_extensions.deprecated( - "gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. 
", - category=exceptions.ApiDeprecationWarning, -) @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. .. note:: - gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. Args: - model_name (str, Default to "gemini-pro"): - The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002" and "gemini-2.0-flash-exp". Default to "gemini-pro". + model_name (str, Default to "gemini-2.0-flash-001"): + The model for natural language tasks. Accepted values are + "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", + "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-lite-001", and "gemini-2.0-flash-001". + If no setting is provided, "gemini-2.0-flash-001" will be used by + default and a warning will be issued. .. note:: - "gemini-pro" is going to be deprecated. Bigframes 2 will transition to using gemini-2.0-X. "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions @@ -953,16 +444,19 @@ class GeminiTextGenerator(base.RetriableRemotePredictor): def __init__( self, *, - model_name: Literal[ - "gemini-pro", - "gemini-1.5-pro-preview-0514", - "gemini-1.5-flash-preview-0514", - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", - "gemini-2.0-flash-exp", - ] = "gemini-pro", + model_name: Optional[ + Literal[ + "gemini-1.5-pro-preview-0514", + "gemini-1.5-flash-preview-0514", + "gemini-1.5-pro-001", + "gemini-1.5-pro-002", + "gemini-1.5-flash-001", + "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", + ] + ] = None, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, max_iterations: int = 300, @@ -977,6 +471,10 @@ def __init__( "(https://cloud.google.com/products#product-launch-stages)." ) warnings.warn(msg, category=exceptions.PreviewWarning) + if model_name is None: + model_name = "gemini-2.0-flash-001" + msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) + warnings.warn(msg, category=FutureWarning, stacklevel=2) self.model_name = model_name self.session = session or global_session.get_global_session() self.max_iterations = max_iterations @@ -1052,8 +550,8 @@ def fit( X: utils.ArrayType, y: utils.ArrayType, ) -> GeminiTextGenerator: - """Fine tune GeminiTextGenerator model. Only support "gemini-pro", "gemini-1.5-pro-002", - "gemini-1.5-flash-002" models for now. + """Fine tune GeminiTextGenerator model. Only support "gemini-1.5-pro-002", + and "gemini-1.5-flash-002" models for now. .. 
note:: @@ -1073,16 +571,13 @@ def fit( """ if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS: raise NotImplementedError( - "fit() only supports gemini-pro, \ - gemini-1.5-pro-002, or gemini-1.5-flash-002 model." + "fit() only supports gemini-1.5-pro-002, or gemini-1.5-flash-002 model." ) X, y = utils.batch_convert_to_dataframe(X, y) options = self._bqml_options - options["endpoint"] = ( - "gemini-1.0-pro-002" if self.model_name == "gemini-pro" else self.model_name - ) + options["endpoint"] = self.model_name options["prompt_col"] = X.columns.tolist()[0] self._bqml_model = self._bqml_model_factory.create_llm_remote_model( @@ -1231,7 +726,8 @@ def score( "text_generation", "classification", "summarization", "question_answering" ] = "text_generation", ) -> bigframes.dataframe.DataFrame: - """Calculate evaluation metrics of the model. Only support "gemini-pro" and "gemini-1.5-pro-002", and "gemini-1.5-flash-002". + """Calculate evaluation metrics of the model. Only support + "gemini-1.5-pro-002", and "gemini-1.5-flash-002". .. note:: @@ -1265,8 +761,7 @@ def score( if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS: raise NotImplementedError( - "score() only supports gemini-pro \ - , gemini-1.5-pro-002, and gemini-1.5-flash-2 model." + "score() only supports gemini-1.5-pro-002, and gemini-1.5-flash-2 model." ) X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) @@ -1329,7 +824,8 @@ class Claude3TextGenerator(base.RetriableRemotePredictor): "claude-3-5-sonnet" is Anthropic's most powerful AI model and maintains the speed and cost of Claude 3 Sonnet, which is a mid-tier model. "claude-3-opus" is Anthropic's second-most powerful AI model, with strong performance on highly complex tasks. https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#available-claude-models - Default to "claude-3-sonnet". + If no setting is provided, "claude-3-sonnet" will be used by default + and a warning will be issued. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. 
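All four generator constructors in this file now follow the same 2.0 pattern: `model_name` becomes `Optional[...]` defaulting to `None`, and a missing value is filled in with the old default while a `FutureWarning` built from `_REMOVE_DEFAULT_MODEL_WARNING` is raised. A minimal sketch of opting out of the warning by naming a model explicitly (model strings taken from the accepted literals above):

>>> import bigframes.ml.llm as llm
>>> # An explicit model_name skips the None branch, so no FutureWarning is raised.
>>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")  # doctest: +SKIP
>>> embedder = llm.TextEmbeddingGenerator(model_name="text-embedding-005")  # doctest: +SKIP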
connection_name (str or None): @@ -1341,12 +837,21 @@ class Claude3TextGenerator(base.RetriableRemotePredictor): def __init__( self, *, - model_name: Literal[ - "claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", "claude-3-opus" - ] = "claude-3-sonnet", + model_name: Optional[ + Literal[ + "claude-3-sonnet", + "claude-3-haiku", + "claude-3-5-sonnet", + "claude-3-opus", + ] + ] = None, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): + if model_name is None: + model_name = "claude-3-sonnet" + msg = exceptions.format_message(_REMOVE_DEFAULT_MODEL_WARNING) + warnings.warn(msg, category=FutureWarning, stacklevel=2) self.model_name = model_name self.session = session or global_session.get_global_session() self.connection_name = connection_name diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index eef72584bc..83c665a50b 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,6 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, + "MATRIX_FACTORIZATION": decomposition.MatrixFactorization, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, @@ -56,11 +57,6 @@ _BQML_ENDPOINT_TYPE_MAPPING = MappingProxyType( { - llm._TEXT_GENERATOR_BISON_ENDPOINT: llm.PaLM2TextGenerator, - llm._TEXT_GENERATOR_BISON_32K_ENDPOINT: llm.PaLM2TextGenerator, - llm._EMBEDDING_GENERATOR_GECKO_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, - llm._EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, - llm._GEMINI_PRO_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_001_ENDPOINT: llm.GeminiTextGenerator, @@ -68,6 +64,8 @@ llm._GEMINI_1P5_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_FLASH_002_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_2_FLASH_EXP_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_2_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_2_FLASH_LITE_001_ENDPOINT: llm.GeminiTextGenerator, llm._CLAUDE_3_HAIKU_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator, @@ -83,6 +81,7 @@ def from_bq( session: bigframes.session.Session, bq_model: bigquery.Model ) -> Union[ + decomposition.MatrixFactorization, decomposition.PCA, cluster.KMeans, linear_model.LinearRegression, @@ -95,8 +94,6 @@ def from_bq( imported.TensorFlowModel, imported.ONNXModel, imported.XGBoostModel, - llm.PaLM2TextGenerator, - llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, llm.MultimodalEmbeddingGenerator, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index e89f17bcaa..a756fac3b9 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -299,6 +299,11 @@ def alter_model( return "\n".join(parts) # ML prediction TVFs + def ml_recommend(self, source_sql: str) -> str: + """Encode ML.RECOMMEND for BQML""" + return f"""SELECT * FROM ML.RECOMMEND(MODEL {self._model_ref_sql()}, + ({source_sql}))""" + def ml_predict(self, source_sql: str) -> str: """Encode ML.PREDICT for BQML""" return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index d25791d3e4..e3f15e67a1 100644 --- 
a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -340,15 +340,17 @@ class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? bins: typing.Union[int, Iterable] right: Optional[bool] - labels: Optional[bool] + labels: typing.Union[bool, Iterable[str], None] @property def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if isinstance(self.bins, int) and (self.labels is False): + if self.labels is False: return dtypes.INT_DTYPE + elif isinstance(self.labels, Iterable): + return dtypes.STRING_DTYPE else: # Assumption: buckets use same numeric type if isinstance(self.bins, int): diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 0ff92187cf..9d73fd43c1 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -46,7 +46,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) >>> df.ai.filter("{city} is the capital of {country}", model) @@ -160,7 +160,7 @@ def map( >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) @@ -283,7 +283,7 @@ def join( >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) @@ -525,7 +525,7 @@ def top_k( >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame( ... { diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index b4fae68a4f..54078557ed 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -297,16 +297,16 @@ def _resolve_connection(self, connection: Optional[str] = None) -> str: ValueError: If the connection cannot be resolved to a valid string. """ connection = connection or self._block.session._bq_connection - return clients.resolve_full_bq_connection_name( + return clients.get_canonical_bq_connection_id( connection, default_project=self._block.session._project, default_location=self._block.session._location, ) - def _get_runtime_json_str( - self, mode: str = "R", with_metadata: bool = False + def get_runtime_json_str( + self, mode: str = "R", *, with_metadata: bool = False ) -> bigframes.series.Series: - """Get the runtime and apply the ToJSONSTring transformation. 
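The `CutOp` change at the top of this file's hunks widens `labels` from `Optional[bool]` to `Union[bool, Iterable[str], None]` and returns `STRING_DTYPE` whenever `labels` is an iterable; this is the typing behind the "pandas.cut 'labels' accepts a list of string" feature in the changelog. A hedged sketch of the resulting user-facing call (output elided):

>>> import bigframes.pandas as bpd
>>> s = bpd.Series([2, 15, 38])
>>> bpd.cut(s, bins=3, labels=["low", "mid", "high"])  # doctest: +SKIP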
+ """Get the runtime (contains signed URL to access gcs data) and apply the ToJSONSTring transformation. .. note:: BigFrames Blob is still under experiments. It may not work and @@ -317,7 +317,7 @@ def _get_runtime_json_str( Default to "R". Possible values are "R" (read-only) and "RW" (read-write) with_metadata (bool, default False): whether to include metadata - in the JOSN string. Default to False. + in the JSON string. Default to False. Returns: str: the runtime object in the JSON string. @@ -325,13 +325,6 @@ def _get_runtime_json_str( runtime = self._get_runtime(mode=mode, with_metadata=with_metadata) return runtime._apply_unary_op(ops.ToJSONString()) - # TODO(b/404605969): remove cleanups when UDF fixes dataset deletion. - def _add_to_cleanup_set(self, udf): - """Add udf name to session cleanup set. Won't need this after UDF fixes dataset deletion.""" - self.session._function_session._update_temp_artifacts( - udf.bigframes_bigquery_function, "" - ) - def image_blur( self, ksize: tuple[int, int], @@ -365,7 +358,7 @@ def image_blur( import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) - df = self._get_runtime_json_str(mode="R").to_frame() + df = self.get_runtime_json_str(mode="R").to_frame() if dst is None: ext = self.uri().str.extract(FILE_EXT_REGEX) @@ -404,7 +397,7 @@ def image_blur( container_memory=container_memory, ).udf() - dst_rt = dst.blob._get_runtime_json_str(mode="RW") + dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") df["ksize_x"], df["ksize_y"] = ksize @@ -413,8 +406,6 @@ def image_blur( res = self._df_apply_udf(df, image_blur_udf) res.cache() # to execute the udf - self._add_to_cleanup_set(image_blur_udf) - return dst def image_resize( @@ -461,7 +452,7 @@ def image_resize( import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) - df = self._get_runtime_json_str(mode="R").to_frame() + df = self.get_runtime_json_str(mode="R").to_frame() if dst is None: ext = self.uri().str.extract(FILE_EXT_REGEX) @@ -501,7 +492,7 @@ def image_resize( container_memory=container_memory, ).udf() - dst_rt = dst.blob._get_runtime_json_str(mode="RW") + dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") df["dsize_x"], df["dsizye_y"] = dsize @@ -511,8 +502,6 @@ def image_resize( res = self._df_apply_udf(df, image_resize_udf) res.cache() # to execute the udf - self._add_to_cleanup_set(image_resize_udf) - return dst def image_normalize( @@ -552,7 +541,7 @@ def image_normalize( import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) - df = self._get_runtime_json_str(mode="R").to_frame() + df = self.get_runtime_json_str(mode="R").to_frame() if dst is None: ext = self.uri().str.extract(FILE_EXT_REGEX) @@ -593,7 +582,7 @@ def image_normalize( container_memory=container_memory, ).udf() - dst_rt = dst.blob._get_runtime_json_str(mode="RW") + dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") df["alpha"] = alpha @@ -604,8 +593,6 @@ def image_normalize( res = self._df_apply_udf(df, image_normalize_udf) res.cache() # to execute the udf - self._add_to_cleanup_set(image_normalize_udf) - return dst def pdf_extract( @@ -657,13 +644,12 @@ def pdf_extract( container_memory=container_memory, ).udf() - src_rt = self._get_runtime_json_str(mode="R") + src_rt = self.get_runtime_json_str(mode="R") res = src_rt.apply(pdf_extract_udf) content_series = 
res._apply_unary_op(ops.JSONValue(json_path="$.content")) - self._add_to_cleanup_set(pdf_extract_udf) if verbose: status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) res_df = bpd.DataFrame({"status": status_series, "content": content_series}) @@ -736,7 +722,7 @@ def pdf_chunk( container_memory=container_memory, ).udf() - src_rt = self._get_runtime_json_str(mode="R") + src_rt = self.get_runtime_json_str(mode="R") df = src_rt.to_frame() df["chunk_size"] = chunk_size df["overlap_size"] = overlap_size @@ -744,7 +730,6 @@ def pdf_chunk( res = self._df_apply_udf(df, pdf_chunk_udf) content_series = bbq.json_extract_string_array(res, "$.content") - self._add_to_cleanup_set(pdf_chunk_udf) if verbose: status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) res_df = bpd.DataFrame({"status": status_series, "content": content_series}) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index d1089f993e..8c5c54e8ca 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -57,7 +57,7 @@ def agg( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame( ... { @@ -326,7 +326,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) >>> df.semantics.filter("{city} is the capital of {country}", model) @@ -440,7 +440,7 @@ def map( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) @@ -563,7 +563,7 @@ def join( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) @@ -805,7 +805,7 @@ def top_k( >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") >>> df = bpd.DataFrame( ... 
{ diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 46d4344499..784af8418d 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -305,9 +305,7 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series: raise NotImplementedError() session = self._block.session - connection = session._create_bq_connection( - connection=connection, iam_role="storage.objectUser" - ) + connection = session._create_bq_connection(connection=connection) return self._apply_binary_op(connection, ops.obj_make_ref_op) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 730c287e1f..8e1e03e024 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -65,14 +65,19 @@ def remote_function( + # Make sure that the input/output types, and dataset can be used + # positionally. This avoids the worst of the breaking change from 1.x to + # 2.x while still preventing possible mixups between consecutive str + # parameters. input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, dataset: Optional[str] = None, + *, bigquery_connection: Optional[str] = None, reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, + cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, @@ -80,9 +85,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Optional[ - Literal["all", "internal-only", "internal-and-gclb"] - ] = None, + cloud_function_ingress_settings: Literal[ + "all", "internal-only", "internal-and-gclb" + ] = "internal-only", ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -112,9 +117,9 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: Optional[str] = None, + dataset: str, bigquery_connection: Optional[str] = None, - name: Optional[str] = None, + name: str, packages: Optional[Sequence[str]] = None, ): return global_session.with_default_session( diff --git a/bigframes/series.py b/bigframes/series.py index be87129929..87f1f1d141 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -23,7 +23,19 @@ import numbers import textwrap import typing -from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import ( + Any, + cast, + Iterable, + List, + Literal, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) +import warnings import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.series as vendored_pandas_series @@ -46,9 +58,11 @@ import bigframes.core.utils as utils import bigframes.core.validations as validations import bigframes.core.window +from bigframes.core.window import rolling import bigframes.core.window_spec as windows import bigframes.dataframe import bigframes.dtypes +import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -385,19 +399,62 @@ def to_pandas( ) -> pandas.Series: """Writes Series to pandas Series. 
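The `remote_function` signature change above makes everything after `dataset` keyword-only and makes `cloud_function_service_account` required. A sketch of the new calling convention, mirroring the `read_gbq_function` doctests later in this diff, where "default" explicitly requests the default compute service account:

>>> import bigframes.pandas as bpd
>>> @bpd.remote_function(cloud_function_service_account="default")  # doctest: +SKIP
... def tenfold(num: int) -> float:
...     return num * 10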
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([4, 3, 2]) + + Download the data from BigQuery and convert it into an in-memory pandas Series. + + >>> s.to_pandas() + 0 4 + 1 3 + 2 2 + dtype: Int64 + + Estimate job statistics without processing or downloading data by using `dry_run=True`. + + >>> s.to_pandas(dry_run=True) # doctest: +SKIP + columnCount 1 + columnDtypes {None: Int64} + indexLevel 1 + indexDtypes [Int64] + projectId bigframes-dev + location US + jobType QUERY + destinationTable {'projectId': 'bigframes-dev', 'datasetId': '_... + useLegacySql False + referencedTables None + totalBytesProcessed 0 + cacheHit False + statementType SELECT + creationTime 2025-04-03 18:54:59.219000+00:00 + dtype: object + Args: max_download_size (int, default None): - Download size threshold in MB. If max_download_size is exceeded when downloading data - (e.g., to_pandas()), the data will be downsampled if - bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be - raised. If set to a value other than None, this will supersede the global config. + .. deprecated:: 2.0.0 + ``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()`` + method instead. + + Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data, + the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is + ``True``, otherwise, an error will be raised. If set to a value other than ``None``, + this will supersede the global config. sampling_method (str, default None): + .. deprecated:: 2.0.0 + ``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead. + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm returns a portion of the data from the beginning. It is fast and requires minimal computations to perform the downsampling; "uniform": This algorithm returns uniform random samples of the data. If set to a value other than None, this will supersede the global config. random_state (int, default None): + .. deprecated:: 2.0.0 + ``random_state`` parameter is deprecated. Please use ``sample()`` method instead. + The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. @@ -416,6 +473,19 @@ def to_pandas( is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If dry_run is set to True, a pandas Series containing dry run statistics will be returned. """ + if max_download_size is not None: + msg = bfe.format_message( + "DEPRECATED: The `max_download_size` parameter for `Series.to_pandas()` " + "is deprecated and will be removed soon. Please use `Series.to_pandas_batches()`." + ) + warnings.warn(msg, category=FutureWarning) + if sampling_method is not None or random_state is not None: + msg = bfe.format_message( + "DEPRECATED: The `sampling_method` and `random_state` parameters for " + "`Series.to_pandas()` are deprecated and will be removed soon. " + "Please use `Series.sample().to_pandas()` instead for sampling."
+ ) + warnings.warn(msg, category=FutureWarning) if dry_run: dry_run_stats, dry_run_job = self._block._compute_dry_run( @@ -445,6 +515,70 @@ def to_pandas( series.name = self._name return series + def to_pandas_batches( + self, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + *, + allow_large_results: Optional[bool] = None, + ) -> Iterable[pandas.Series]: + """Stream Series results to an iterable of pandas Series. + + page_size and max_results determine the size and number of batches, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([4, 3, 2, 2, 3]) + + Iterate through the results in batches, limiting the total rows yielded + across all batches via `max_results`: + + >>> for s_batch in s.to_pandas_batches(max_results=3): + ... print(s_batch) + 0 4 + 1 3 + 2 2 + dtype: Int64 + + Alternatively, control the approximate size of each batch using `page_size` + and fetch batches manually using `next()`: + + >>> it = s.to_pandas_batches(page_size=2) + >>> next(it) + 0 4 + 1 3 + dtype: Int64 + >>> next(it) + 2 2 + 3 2 + dtype: Int64 + + Args: + page_size (int, default None): + The maximum number of rows of each batch. Non-positive values are ignored. + max_results (int, default None): + The maximum total number of rows of all batches. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. + + Returns: + Iterable[pandas.Series]: + An iterable of smaller Series which combine to + form the original Series. Results stream from bigquery, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable + """ + df = self._block.to_pandas_batches( + page_size=page_size, + max_results=max_results, + allow_large_results=allow_large_results, + squeeze=True, + ) + return df + def _compute_dry_run(self) -> bigquery.QueryJob: _, query_job = self._block._compute_dry_run((self._value_column,)) return query_job @@ -1378,7 +1512,9 @@ def _apply_aggregation( ) -> Any: return self._block.get_stat(self._value_column, op) - def _apply_window_op(self, op: agg_ops.WindowOp, window_spec: windows.WindowSpec): + def _apply_window_op( + self, op: agg_ops.UnaryWindowOp, window_spec: windows.WindowSpec + ): block = self._block block, result_id = block.apply_window_op( self._value_column, op, window_spec=window_spec, result_label=self.name @@ -1439,16 +1575,26 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: @validations.requires_ordering() def rolling( self, - window: int, - min_periods=None, + window: int | pandas.Timedelta | numpy.timedelta64 | datetime.timedelta | str, + min_periods: int | None = None, closed: Literal["right", "left", "both", "neither"] = "right", ) -> bigframes.core.window.Window: - window_spec = windows.WindowSpec( - bounds=windows.RowsWindowBounds.from_window_size(window, closed), - min_periods=min_periods if min_periods is not None else window, - ) - return bigframes.core.window.Window( - self._block, window_spec, self._block.value_columns, is_series=True + if isinstance(window, int): + # Rows rolling + window_spec = windows.WindowSpec( + bounds=windows.RowsWindowBounds.from_window_size(window, closed), + 
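The `rolling` rework above accepts an `int` for row-based windows and now also `pandas.Timedelta`/`numpy.timedelta64`/`datetime.timedelta`/`str`, which the branch completing just below routes to `rolling.create_range_window` for range-based windows. A sketch, assuming a pandas-style offset string over a timestamp index (exact accepted window values follow `create_range_window`):

>>> import pandas as pd
>>> import bigframes.pandas as bpd
>>> pdf = pd.Series([1, 2, 3], index=pd.to_datetime(["2025-01-01", "2025-01-02", "2025-01-04"]))
>>> s = bpd.read_pandas(pdf)
>>> s.rolling(window="2D").sum()  # doctest: +SKIP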
min_periods=window if min_periods is None else min_periods, + ) + return bigframes.core.window.Window( + self._block, window_spec, self._block.value_columns, is_series=True + ) + + return rolling.create_range_window( + block=self._block, + window=window, + min_periods=min_periods, + closed=closed, + is_series=True, ) @validations.requires_ordering() diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 3ac9b75039..9d45019fc5 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -53,30 +53,23 @@ ReadPickleBuffer, StorageOptions, ) -import pyarrow as pa from bigframes import exceptions as bfe from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -import bigframes.core.blocks as blocks -import bigframes.core.compile -import bigframes.core.guid -import bigframes.core.pruning +from bigframes.core import blocks # Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. -import bigframes.dataframe -import bigframes.dtypes import bigframes.functions._function_session as bff_session import bigframes.functions.function as bff +from bigframes.session import bigquery_session, bq_caching_executor, executor import bigframes.session._io.bigquery as bf_io_bigquery +import bigframes.session.anonymous_dataset import bigframes.session.clients -import bigframes.session.executor import bigframes.session.loader import bigframes.session.metrics -import bigframes.session.planner -import bigframes.session.temp_storage import bigframes.session.validation # Avoid circular imports. @@ -107,22 +100,6 @@ logger = logging.getLogger(__name__) -# Excludes geography and nested (array, struct) datatypes -INLINABLE_DTYPES: Sequence[bigframes.dtypes.Dtype] = ( - pandas.BooleanDtype(), - pandas.Float64Dtype(), - pandas.Int64Dtype(), - pandas.StringDtype(storage="pyarrow"), - pandas.ArrowDtype(pa.binary()), - pandas.ArrowDtype(pa.date32()), - pandas.ArrowDtype(pa.time64("us")), - pandas.ArrowDtype(pa.timestamp("us")), - pandas.ArrowDtype(pa.timestamp("us", tz="UTC")), - pandas.ArrowDtype(pa.decimal128(38, 9)), - pandas.ArrowDtype(pa.decimal256(76, 38)), - pandas.ArrowDtype(pa.duration("us")), -) - class Session( third_party_pandas_gbq.GBQIOMixin, @@ -247,22 +224,32 @@ def __init__( self._metrics = bigframes.session.metrics.ExecutionMetrics() self._function_session = bff_session.FunctionSession() - self._temp_storage_manager = ( - bigframes.session.temp_storage.AnonymousDatasetManager( + self._anon_dataset_manager = ( + bigframes.session.anonymous_dataset.AnonymousDatasetManager( self._clients_provider.bqclient, location=self._location, session_id=self._session_id, kms_key=self._bq_kms_key_name, ) ) - self._executor: bigframes.session.executor.Executor = ( - bigframes.session.executor.BigQueryCachingExecutor( - bqclient=self._clients_provider.bqclient, - bqstoragereadclient=self._clients_provider.bqstoragereadclient, - storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, - metrics=self._metrics, + # Session temp tables don't support specifying kms key, so use anon dataset if kms key specified + self._session_resource_manager = ( + bigquery_session.SessionResourceManager( + self.bqclient, + self._location, ) + if (self._bq_kms_key_name is None) + else None + ) + self._temp_storage_manager = ( + self._session_resource_manager or self._anon_dataset_manager + ) + self._executor: executor.Executor = 
bq_caching_executor.BigQueryCachingExecutor( + bqclient=self._clients_provider.bqclient, + bqstoragereadclient=self._clients_provider.bqstoragereadclient, + storage_manager=self._temp_storage_manager, + strictly_ordered=self._strictly_ordered, + metrics=self._metrics, ) self._loader = bigframes.session.loader.GbqDataLoader( session=self, @@ -375,7 +362,7 @@ def _allows_ambiguity(self) -> bool: @property def _anonymous_dataset(self): - return self._temp_storage_manager.dataset + return self._anon_dataset_manager.dataset def __hash__(self): # Stable hash needed to use in expression tree @@ -388,9 +375,11 @@ def close(self): # Protect against failure when the Session is a fake for testing or # failed to initialize. - temp_storage_manager = getattr(self, "_temp_storage_manager", None) - if temp_storage_manager: - self._temp_storage_manager.clean_up_tables() + if anon_dataset_manager := getattr(self, "_anon_dataset_manager", None): + anon_dataset_manager.close() + + if session_resource_manager := getattr(self, "_session_resource_manager", None): + session_resource_manager.close() remote_function_session = getattr(self, "_function_session", None) if remote_function_session: @@ -793,19 +782,29 @@ def _read_pandas( "bigframes.pandas.DataFrame." ) + mem_usage = pandas_dataframe.memory_usage(deep=True).sum() if write_engine == "default": - try: - inline_df = self._read_pandas_inline(pandas_dataframe) - return inline_df - except ValueError: - pass - return self._read_pandas_load_job(pandas_dataframe, api_name) - elif write_engine == "bigquery_inline": + write_engine = ( + "bigquery_load" + if mem_usage > MAX_INLINE_DF_BYTES + else "bigquery_inline" + ) + + if write_engine == "bigquery_inline": + if mem_usage > MAX_INLINE_DF_BYTES: + raise ValueError( + f"DataFrame size ({mem_usage} bytes) exceeds the maximum allowed " + f"for inline data ({MAX_INLINE_DF_BYTES} bytes)." + ) return self._read_pandas_inline(pandas_dataframe) elif write_engine == "bigquery_load": - return self._read_pandas_load_job(pandas_dataframe, api_name) + return self._loader.read_pandas( + pandas_dataframe, method="load", api_name=api_name + ) elif write_engine == "bigquery_streaming": - return self._read_pandas_streaming(pandas_dataframe) + return self._loader.read_pandas( + pandas_dataframe, method="stream", api_name=api_name + ) else: raise ValueError(f"Got unexpected write_engine '{write_engine}'") @@ -814,56 +813,8 @@ def _read_pandas_inline( ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe - memory_usage = pandas_dataframe.memory_usage(deep=True).sum() - if memory_usage > MAX_INLINE_DF_BYTES: - raise ValueError( - f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed " - f"for inline data ({MAX_INLINE_DF_BYTES} bytes)." - ) - - try: - local_block = blocks.Block.from_local(pandas_dataframe, self) - inline_df = dataframe.DataFrame(local_block) - except ( - pa.ArrowInvalid, # Thrown by arrow for unsupported types, such as geo. - pa.ArrowTypeError, # Thrown by arrow for types without mapping (geo). - ValueError, # Thrown by ibis for some unhandled types - TypeError, # Not all types handleable by local code path - ) as exc: - raise ValueError( - f"Could not convert with a BigQuery type: `{exc}`. " - ) from exc - - # Make sure all types are inlinable to avoid escaping errors. 
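With the `_read_pandas` dispatch above, `write_engine="default"` now resolves to `bigquery_inline` or `bigquery_load` purely by comparing `memory_usage(deep=True)` against `MAX_INLINE_DF_BYTES`, replacing the try/except type-check fallback whose deletion continues below. A sketch of how this surfaces through the public API (assuming the public `read_pandas` forwards `write_engine` as in 1.x):

>>> import pandas as pd
>>> import bigframes.pandas as bpd
>>> pdf = pd.DataFrame({"x": [1, 2, 3]})
>>> bpd.read_pandas(pdf)  # small frame: inlined by default  # doctest: +SKIP
>>> bpd.read_pandas(pdf, write_engine="bigquery_load")  # force a load job  # doctest: +SKIP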
- inline_types = inline_df._block.expr.schema.dtypes - noninlinable_types = [ - dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES - ] - if len(noninlinable_types) != 0: - raise ValueError( - f"Could not inline with a BigQuery type: `{noninlinable_types}`. " - f"{constants.FEEDBACK_LINK}" - ) - - return inline_df - - def _read_pandas_load_job( - self, - pandas_dataframe: pandas.DataFrame, - api_name: str, - ) -> dataframe.DataFrame: - try: - return self._loader.read_pandas_load_job(pandas_dataframe, api_name) - except (pa.ArrowInvalid, pa.ArrowTypeError) as exc: - raise ValueError( - f"Could not convert with a BigQuery type: `{exc}`." - ) from exc - - def _read_pandas_streaming( - self, - pandas_dataframe: pandas.DataFrame, - ) -> dataframe.DataFrame: - return self._loader.read_pandas_streaming(pandas_dataframe) + local_block = blocks.Block.from_local(pandas_dataframe, self) + return dataframe.DataFrame(local_block) def read_csv( self, @@ -906,117 +857,183 @@ def read_csv( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager.allocate_temp_table() - - if engine is not None and engine == "bigquery": - if any(param is not None for param in (dtype, names)): - not_supported = ("dtype", "names") - raise NotImplementedError( - f"BigQuery engine does not support these arguments: {not_supported}. " - f"{constants.FEEDBACK_LINK}" - ) - # TODO(b/338089659): Looks like we can relax this 1 column - # restriction if we check the contents of an iterable are strings - # not integers. - if ( - # Empty tuples, None, and False are allowed and falsey. - index_col - and not isinstance(index_col, bigframes.enums.DefaultIndexKind) - and not isinstance(index_col, str) - ): - raise NotImplementedError( - "BigQuery engine only supports a single column name for `index_col`, " - f"got: {repr(index_col)}. {constants.FEEDBACK_LINK}" - ) + if engine != "bigquery": + # Using pandas.read_csv by default and warning about potential issues with + # large files. + return self._read_csv_w_pandas_engines( + filepath_or_buffer, + sep=sep, + header=header, + names=names, + index_col=index_col, + usecols=usecols, # type: ignore + dtype=dtype, + engine=engine, + encoding=encoding, + write_engine=write_engine, + **kwargs, + ) + else: + return self._read_csv_w_bigquery_engine( + filepath_or_buffer, + sep=sep, + header=header, + names=names, + index_col=index_col, + usecols=usecols, # type: ignore + dtype=dtype, + encoding=encoding, + ) - # None and False cannot be passed to read_gbq. - # TODO(b/338400133): When index_col is None, we should be using the - # first column of the CSV as the index to be compatible with the - # pandas engine. According to the pandas docs, only "False" - # indicates a default sequential index. - if not index_col: - index_col = () + def _read_csv_w_pandas_engines( + self, + filepath_or_buffer, + *, + sep, + header, + names, + index_col, + usecols, + dtype, + engine, + encoding, + write_engine, + **kwargs, + ) -> dataframe.DataFrame: + """Reads a CSV file using pandas engines into a BigQuery DataFrames. - index_col = typing.cast( - Union[ - Sequence[str], # Falsey values - bigframes.enums.DefaultIndexKind, - str, - ], - index_col, + This method serves as the implementation backend for read_csv when the + specified engine is one supported directly by pandas ('c', 'python', + 'pyarrow'). + """ + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + raise NotImplementedError( + f"With index_col={repr(index_col)}, only engine='bigquery' is supported. 
" + f"{constants.FEEDBACK_LINK}" + ) + if any(arg in kwargs for arg in ("chunksize", "iterator")): + raise NotImplementedError( + "'chunksize' and 'iterator' arguments are not supported. " + f"{constants.FEEDBACK_LINK}" ) + if isinstance(filepath_or_buffer, str): + self._check_file_size(filepath_or_buffer) - # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. - columns: Tuple[Any, ...] = tuple() - if usecols is not None: - if isinstance(usecols, Iterable) and all( - isinstance(col, str) for col in usecols - ): - columns = tuple(col for col in usecols) - else: - raise NotImplementedError( - "BigQuery engine only supports an iterable of strings for `usecols`. " - f"{constants.FEEDBACK_LINK}" - ) + pandas_df = pandas.read_csv( + filepath_or_buffer, + sep=sep, + header=header, + names=names, + index_col=index_col, + usecols=usecols, # type: ignore + dtype=dtype, + engine=engine, + encoding=encoding, + **kwargs, + ) + return self._read_pandas(pandas_df, api_name="read_csv", write_engine=write_engine) # type: ignore - if encoding is not None and encoding not in _VALID_ENCODINGS: - raise NotImplementedError( - f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}. " - f"{constants.FEEDBACK_LINK}" - ) + def _read_csv_w_bigquery_engine( + self, + filepath_or_buffer, + *, + sep, + header, + names, + index_col, + usecols, + dtype, + encoding, + ) -> dataframe.DataFrame: + """Reads a CSV file using the BigQuery engine into a BigQuery DataFrames. - job_config = bigquery.LoadJobConfig() - job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED - job_config.source_format = bigquery.SourceFormat.CSV - job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY - job_config.autodetect = True - job_config.field_delimiter = sep - job_config.encoding = encoding - job_config.labels = {"bigframes-api": "read_csv"} + This method serves as the implementation backend for read_csv when the + 'bigquery' engine is specified or inferred. It leverages BigQuery's + native CSV loading capabilities, making it suitable for large datasets + that may not fit into local memory. + """ - # We want to match pandas behavior. If header is 0, no rows should be skipped, so we - # do not need to set `skip_leading_rows`. If header is None, then there is no header. - # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows. - if header is None: - job_config.skip_leading_rows = 0 - elif header > 0: - job_config.skip_leading_rows = header + if any(param is not None for param in (dtype, names)): + not_supported = ("dtype", "names") + raise NotImplementedError( + f"BigQuery engine does not support these arguments: {not_supported}. " + f"{constants.FEEDBACK_LINK}" + ) - return self._loader._read_bigquery_load_job( - filepath_or_buffer, - table, - job_config=job_config, - index_col=index_col, - columns=columns, + # TODO(b/338089659): Looks like we can relax this 1 column + # restriction if we check the contents of an iterable are strings + # not integers. + if ( + # Empty tuples, None, and False are allowed and falsey. + index_col + and not isinstance(index_col, bigframes.enums.DefaultIndexKind) + and not isinstance(index_col, str) + ): + raise NotImplementedError( + "BigQuery engine only supports a single column name for `index_col`, " + f"got: {repr(index_col)}. 
{constants.FEEDBACK_LINK}" ) - else: - if isinstance(index_col, bigframes.enums.DefaultIndexKind): - raise NotImplementedError( - f"With index_col={repr(index_col)}, only engine='bigquery' is supported. " - f"{constants.FEEDBACK_LINK}" - ) - if any(arg in kwargs for arg in ("chunksize", "iterator")): + + # None and False cannot be passed to read_gbq. + # TODO(b/338400133): When index_col is None, we should be using the + # first column of the CSV as the index to be compatible with the + # pandas engine. According to the pandas docs, only "False" + # indicates a default sequential index. + if not index_col: + index_col = () + + index_col = typing.cast( + Union[ + Sequence[str], # Falsey values + bigframes.enums.DefaultIndexKind, + str, + ], + index_col, + ) + + # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. + columns: Tuple[Any, ...] = tuple() + if usecols is not None: + if isinstance(usecols, Iterable) and all( + isinstance(col, str) for col in usecols + ): + columns = tuple(col for col in usecols) + else: raise NotImplementedError( - "'chunksize' and 'iterator' arguments are not supported. " + "BigQuery engine only supports an iterable of strings for `usecols`. " f"{constants.FEEDBACK_LINK}" ) - if isinstance(filepath_or_buffer, str): - self._check_file_size(filepath_or_buffer) - pandas_df = pandas.read_csv( - filepath_or_buffer, - sep=sep, - header=header, - names=names, - index_col=index_col, - usecols=usecols, # type: ignore - dtype=dtype, - engine=engine, - encoding=encoding, - **kwargs, + if encoding is not None and encoding not in _VALID_ENCODINGS: + raise NotImplementedError( + f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}. " + f"{constants.FEEDBACK_LINK}" ) - return self._read_pandas(pandas_df, api_name="read_csv", write_engine=write_engine) # type: ignore + + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.CSV + job_config.autodetect = True + job_config.field_delimiter = sep + job_config.encoding = encoding + job_config.labels = {"bigframes-api": "read_csv"} + + # b/409070192: When header > 0, pandas and BigFrames returns different column naming. + + # We want to match pandas behavior. If header is 0, no rows should be skipped, so we + # do not need to set `skip_leading_rows`. If header is None, then there is no header. + # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows. 
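Given these engine-specific restrictions, and the header-to-`skip_leading_rows` mapping implemented just after this note (`header + 1` when `header > 0`, per b/409070192), a sketch of the BigQuery-engine path; the GCS URI is a placeholder:

>>> import bigframes.pandas as bpd
>>> df = bpd.read_csv(
...     "gs://my-bucket/data.csv",  # hypothetical path
...     engine="bigquery",
...     header=0,
... )  # doctest: +SKIP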
+ if header is None: + job_config.skip_leading_rows = 0 + elif header > 0: + job_config.skip_leading_rows = header + 1 + + return self._loader.read_bigquery_load_job( + filepath_or_buffer, + job_config=job_config, + index_col=index_col, + columns=columns, + ) def read_pickle( self, @@ -1052,18 +1069,12 @@ def read_parquet( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager.allocate_temp_table() - if engine == "bigquery": job_config = bigquery.LoadJobConfig() - job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.PARQUET - job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY job_config.labels = {"bigframes-api": "read_parquet"} - return self._loader._read_bigquery_load_job( - path, table, job_config=job_config - ) + return self._loader.read_bigquery_load_job(path, job_config=job_config) else: if "*" in path: raise ValueError( @@ -1106,8 +1117,6 @@ def read_json( engine=engine, write_engine=write_engine, ) - table = self._temp_storage_manager.allocate_temp_table() - if engine == "bigquery": if dtype is not None: @@ -1131,16 +1140,13 @@ def read_json( ) job_config = bigquery.LoadJobConfig() - job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON - job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY job_config.autodetect = True job_config.encoding = encoding job_config.labels = {"bigframes-api": "read_json"} - return self._loader._read_bigquery_load_job( + return self._loader.read_bigquery_load_job( path_or_buf, - table, job_config=job_config, ) else: @@ -1202,14 +1208,19 @@ def _check_file_size(self, filepath: str): def remote_function( self, + # Make sure that the input/output types, and dataset can be used + # positionally. This avoids the worst of the breaking change from 1.x to + # 2.x while still preventing possible mixups between consecutive str + # parameters. input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, dataset: Optional[str] = None, + *, bigquery_connection: Optional[str] = None, reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, + cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, @@ -1217,9 +1228,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Optional[ - Literal["all", "internal-only", "internal-and-gclb"] - ] = None, + cloud_function_ingress_settings: Literal[ + "all", "internal-only", "internal-and-gclb" + ] = "internal-only", ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1327,8 +1338,8 @@ def remote_function( Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. 
If not provided + cloud_function_service_account (str): + Service account to use for the cloud functions. If "default" provided then the default service account would be used. See https://cloud.google.com/functions/docs/securing/function-identity for more details. Please make sure the service account has the @@ -1392,8 +1403,8 @@ def remote_function( cloud_function_ingress_settings (str, Optional): Ingress settings controls dictating what traffic can reach the function. Options are: `all`, `internal-only`, or `internal-and-gclb`. - If no setting is provided, `all` will be used by default and a warning - will be issued. See for more details + If no setting is provided, `internal-only` will be used by default. + See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. Returns: collections.abc.Callable: @@ -1406,8 +1417,8 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return self._function_session.remote_function( - input_types, - output_type, + input_types=input_types, + output_type=output_type, session=self, dataset=dataset, bigquery_connection=bigquery_connection, @@ -1430,9 +1441,9 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: Optional[str] = None, + dataset: str, bigquery_connection: Optional[str] = None, - name: Optional[str] = None, + name: str, packages: Optional[Sequence[str]] = None, ): """Decorator to turn a Python user defined function (udf) into a @@ -1459,30 +1470,33 @@ def udf( be specified. The supported output types are `bool`, `bytes`, `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` and `list[str]`. - dataset (str, Optional): + dataset (str): Dataset in which to create a BigQuery managed function. It should be in `.` or `` - format. If this parameter is not provided then session dataset - id is used. + format. bigquery_connection (str, Optional): - Name of the BigQuery connection. You should either have the - connection already created in the `location` you have chosen, or - you should have the Project IAM Admin role to enable the service - to create the connection for you if you need it. If this - parameter is not provided then the BigQuery connection from the - session is used. - name (str, Optional): + Name of the BigQuery connection. It is used to provide an + identity to the serverless instances running the user code. It + helps BigQuery manage and track the resources used by the udf. + This connection is required for internet access and for + interacting with other GCP services. To access GCP services, the + appropriate IAM permissions must also be granted to the + connection's Service Account. When it defaults to None, the udf + will be created without any connection. A udf without a + connection has no internet access and no access to other GCP + services. + name (str): Explicit name of the persisted BigQuery managed function. Use it with caution, because more than one users working in the same project and dataset could overwrite each other's managed - functions if they use the same persistent name. When an explicit - name is provided, any session specific clean up ( + functions if they use the same persistent name. 
Please note that + any session specific clean up ( ``bigframes.session.Session.close``/ ``bigframes.pandas.close_session``/ ``bigframes.pandas.reset_session``/ ``bigframes.pandas.clean_up_by_session_id``) does not clean up - the function, and leaves it for the user to manage the function - and the associated cloud function directly. + this function, and leaves it for the user to manage the function + directly. packages (str[], Optional): Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be @@ -1499,8 +1513,8 @@ def udf( deployed for the user defined code. """ return self._function_session.udf( - input_types, - output_type, + input_types=input_types, + output_type=output_type, session=self, dataset=dataset, bigquery_connection=bigquery_connection, @@ -1593,7 +1607,7 @@ def read_gbq_function( Another use case is to define your own remote function and use it later. For example, define the remote function: - >>> @bpd.remote_function() + >>> @bpd.remote_function(cloud_function_service_account="default") ... def tenfold(num: int) -> float: ... return num * 10 @@ -1620,7 +1634,7 @@ def read_gbq_function( note, row processor implies that the function has only one input parameter. - >>> @bpd.remote_function() + >>> @bpd.remote_function(cloud_function_service_account="default") ... def row_sum(s: bpd.Series) -> float: ... return s['a'] + s['b'] + s['c'] @@ -1708,7 +1722,7 @@ def _start_query_ml_ddl( def _create_object_table(self, path: str, connection: str) -> str: """Create a random id Object Table from the input path and connection.""" - table = str(self._loader._storage_manager.generate_unique_resource_id()) + table = str(self._anon_dataset_manager.generate_unique_resource_id()) import textwrap @@ -1757,9 +1771,7 @@ def from_glob_path( raise NotImplementedError() # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done. - connection = self._create_bq_connection( - connection=connection, iam_role="storage.objectUser" - ) + connection = self._create_bq_connection(connection=connection) table = self._create_object_table(path, connection) @@ -1769,13 +1781,16 @@ def from_glob_path( return s.rename(name).to_frame() def _create_bq_connection( - self, iam_role: str, *, connection: Optional[str] = None + self, + *, + connection: Optional[str] = None, + iam_role: Optional[str] = None, ) -> str: """Create the connection with the session settings and try to attach iam role to the connection SA. If any of project, location or connection isn't specified, use the session defaults. 
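Putting the new `udf` contract together: `dataset` and `name` are now mandatory, `bigquery_connection` stays optional (omitting it yields a UDF with no internet or GCP-service access), and input/output types come from the annotations. A sketch with placeholder dataset and function names:

>>> import bigframes.pandas as bpd
>>> @bpd.udf(dataset="my_dataset", name="string_len")  # doctest: +SKIP
... def string_len(s: str) -> int:
...     return len(s)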
Returns fully-qualified connection name.""" connection = self._bq_connection if not connection else connection - connection = bigframes.clients.resolve_full_bq_connection_name( - connection_name=connection, + connection = bigframes.clients.get_canonical_bq_connection_id( + connection_id=connection, default_project=self._project, default_location=self._location, ) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index d9f1c0f295..4fdd836777 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -245,6 +245,8 @@ def start_query_with_client( location=location, project=project, api_timeout=timeout, + page_size=page_size, + max_results=max_results, ) if metrics is not None: metrics.count_job_stats(query=sql) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index ca70ee774c..9340e060ac 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -19,7 +19,6 @@ import bigframes_vendored.constants as constants import geopandas # type: ignore -import numpy as np import pandas import pandas.arrays import pyarrow # type: ignore @@ -27,7 +26,6 @@ import pyarrow.types # type: ignore import bigframes.core.schema -import bigframes.core.utils as utils import bigframes.dtypes import bigframes.features @@ -80,7 +78,10 @@ def arrow_to_pandas( if dtype == geopandas.array.GeometryDtype(): series = geopandas.GeoSeries.from_wkt( - column, + # Use `to_pylist()` is a workaround for TypeError: object of type + # 'pyarrow.lib.StringScalar' has no len() on older pyarrow, + # geopandas, shapely combinations. + column.to_pylist(), # BigQuery geography type is based on the WGS84 reference ellipsoid. crs="EPSG:4326", ) @@ -130,49 +131,3 @@ def arrow_to_pandas( serieses[field.name] = series return pandas.DataFrame(serieses) - - -def pandas_to_bq_compatible(pandas_dataframe: pandas.DataFrame) -> DataFrameAndLabels: - """Convert a pandas DataFrame into something compatible with uploading to a - BigQuery table (without flexible column names enabled). - """ - col_index = pandas_dataframe.columns.copy() - col_labels, idx_labels = ( - col_index.to_list(), - pandas_dataframe.index.names, - ) - new_col_ids, new_idx_ids = utils.get_standardized_ids( - col_labels, - idx_labels, - # Loading parquet files into BigQuery with special column names - # is only supported under an allowlist. 
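The `to_pylist()` change in `arrow_to_pandas` above hands `GeoSeries.from_wkt` a plain Python list rather than a pyarrow array, sidestepping the `TypeError` seen on older pyarrow/geopandas/shapely combinations. The working call in isolation:

>>> import geopandas
>>> import pyarrow as pa
>>> column = pa.array(["POINT (1 2)", "POINT (3 4)"])
>>> geopandas.GeoSeries.from_wkt(column.to_pylist(), crs="EPSG:4326")  # doctest: +SKIP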
- strict=True, - ) - - # Add order column to pandas DataFrame to preserve order in BigQuery - ordering_col = "rowid" - columns = frozenset(col_labels + idx_labels) - suffix = 2 - while ordering_col in columns: - ordering_col = f"rowid_{suffix}" - suffix += 1 - - pandas_dataframe_copy = pandas_dataframe.copy() - pandas_dataframe_copy.index.names = new_idx_ids - pandas_dataframe_copy.columns = pandas.Index(new_col_ids) - pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) - - timedelta_cols = utils.replace_timedeltas_with_micros(pandas_dataframe_copy) - json_cols = utils.replace_json_with_string(pandas_dataframe_copy) - col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - **{col: bigframes.dtypes.TIMEDELTA_DTYPE for col in timedelta_cols}, - **{col: bigframes.dtypes.JSON_DTYPE for col in json_cols}, - } - - return DataFrameAndLabels( - df=pandas_dataframe_copy, - column_labels=col_labels, - index_labels=idx_labels, - ordering_col=ordering_col, - col_type_overrides=col_type_overrides, - ) diff --git a/bigframes/session/temp_storage.py b/bigframes/session/anonymous_dataset.py similarity index 89% rename from bigframes/session/temp_storage.py rename to bigframes/session/anonymous_dataset.py index 3b2965efef..c5808aa63c 100644 --- a/bigframes/session/temp_storage.py +++ b/bigframes/session/anonymous_dataset.py @@ -18,13 +18,14 @@ import google.cloud.bigquery as bigquery -import bigframes.constants as constants +from bigframes import constants +from bigframes.session import temporary_storage import bigframes.session._io.bigquery as bf_io_bigquery _TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" -class AnonymousDatasetManager: +class AnonymousDatasetManager(temporary_storage.TemporaryStorageManager): """ Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. """ @@ -38,10 +39,10 @@ def __init__( kms_key: Optional[str] = None ): self.bqclient = bqclient - self.location = location + self._location = location self.dataset = bf_io_bigquery.create_bq_dataset_reference( self.bqclient, - location=self.location, + location=self._location, api_name="session-__init__", ) @@ -49,8 +50,12 @@ def __init__( self._table_ids: List[bigquery.TableReference] = [] self._kms_key = kms_key - def allocate_and_create_temp_table( - self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] + @property + def location(self): + return self._location + + def create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] ) -> bigquery.TableReference: """ Allocates and and creates a table in the anonymous dataset. @@ -99,7 +104,8 @@ def generate_unique_resource_id(self) -> bigquery.TableReference: ) return self.dataset.table(table_id) - def clean_up_tables(self): + def close(self): """Delete tables that were created with this session's session_id.""" for table_ref in self._table_ids: self.bqclient.delete_table(table_ref, not_found_ok=True) + self._table_ids.clear() diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py new file mode 100644 index 0000000000..ae8dc88d43 --- /dev/null +++ b/bigframes/session/bigquery_session.py @@ -0,0 +1,168 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import logging +import threading +from typing import Callable, Optional, Sequence +import uuid + +# TODO: Non-ibis implementation +import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bq +import google.cloud.bigquery as bigquery + +from bigframes.core.compile import googlesql +from bigframes.session import temporary_storage + +KEEPALIVE_QUERY_TIMEOUT_SECONDS = 5.0 + +KEEPALIVE_FREQUENCY = datetime.timedelta(hours=6) + + +logger = logging.getLogger(__name__) + + +class SessionResourceManager(temporary_storage.TemporaryStorageManager): + """ + Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. + """ + + def __init__(self, bqclient: bigquery.Client, location: str): + self.bqclient = bqclient + self._location = location + self._session_id: Optional[str] = None + self._sessiondaemon: Optional[RecurringTaskDaemon] = None + self._session_lock = threading.RLock() + + @property + def location(self): + return self._location + + def create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] + ) -> bigquery.TableReference: + """Create a temporary session table. Session is an exclusive resource, so throughput is limited""" + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + with self._session_lock: + table_ref = bigquery.TableReference( + bigquery.DatasetReference(self.bqclient.project, "_SESSION"), + f"bqdf_{uuid.uuid4()}", + ) + job_config = bigquery.QueryJobConfig( + connection_properties=[ + bigquery.ConnectionProperty("session_id", self._get_session_id()) + ] + ) + + ibis_schema = ibis_bq.BigQuerySchema.to_ibis(list(schema)) + + fields = [ + f"{googlesql.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" + for name, ibis_type in ibis_schema.fields.items() + ] + fields_string = ",".join(fields) + + cluster_string = "" + if cluster_cols: + cluster_cols_sql = ", ".join( + f"{googlesql.identifier(cluster_col)}" + for cluster_col in cluster_cols + ) + cluster_string = f"\nCLUSTER BY {cluster_cols_sql}" + + ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" + + job = self.bqclient.query(ddl, job_config=job_config) + job.result() + # return the fully qualified table, so it can be used outside of the session + return job.destination + + def close(self): + if self._sessiondaemon is not None: + self._sessiondaemon.stop() + + if self._session_id is not None and self.bqclient is not None: + self.bqclient.query_and_wait(f"CALL BQ.ABORT_SESSION('{self._session_id}')") + + def _get_session_id(self) -> str: + if self._session_id: + return self._session_id + with self._session_lock: + if self._session_id is None: + job_config = bigquery.QueryJobConfig(create_session=True) + # Make sure the session is a new one, not one associated with another query. 
+                job_config.use_query_cache = False
+                query_job = self.bqclient.query(
+                    "SELECT 1", job_config=job_config, location=self.location
+                )
+                query_job.result()  # blocks until finished
+                assert query_job.session_info is not None
+                assert query_job.session_info.session_id is not None
+                self._session_id = query_job.session_info.session_id
+                self._sessiondaemon = RecurringTaskDaemon(
+                    task=self._keep_session_alive, frequency=KEEPALIVE_FREQUENCY
+                )
+                self._sessiondaemon.start()
+                return query_job.session_info.session_id
+            else:
+                return self._session_id
+
+    def _keep_session_alive(self):
+        # BigQuery sessions expire after 24 hours of disuse by default, but each
+        # query renews the session, up to a maximum lifetime of 7 days.
+        with self._session_lock:
+            job_config = bigquery.QueryJobConfig(
+                connection_properties=[
+                    bigquery.ConnectionProperty("session_id", self._get_session_id())
+                ]
+            )
+            try:
+                self.bqclient.query_and_wait(
+                    "SELECT 1",
+                    location=self.location,
+                    job_config=job_config,
+                    wait_timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS,
+                )
+            except Exception as e:
+                logger.warning("BigQuery session keep-alive query errored: %s", e)
+
+
+class RecurringTaskDaemon:
+    def __init__(self, task: Callable[[], None], frequency: datetime.timedelta):
+        self._stop_event = threading.Event()
+        self._frequency = frequency
+        self._thread = threading.Thread(target=self._run_loop, daemon=True)
+        self._task = task
+
+    def start(self):
+        """Start the daemon. Cannot be restarted once stopped."""
+        if self._stop_event.is_set():
+            raise RuntimeError("Cannot restart daemon thread.")
+        self._thread.start()
+
+    def _run_loop(self):
+        while True:
+            self._stop_event.wait(self._frequency.total_seconds())
+            if self._stop_event.is_set():
+                return
+            try:
+                self._task()
+            except Exception as e:
+                logger.warning("RecurringTaskDaemon task errored: %s", e)
+
+    def stop(self, timeout_seconds: Optional[float] = None):
+        """Stop and clean up the daemon."""
+        if self._thread.is_alive():
+            self._stop_event.set()
+            self._thread.join(timeout=timeout_seconds)
diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py
new file mode 100644
index 0000000000..983b1918f5
--- /dev/null
+++ b/bigframes/session/bq_caching_executor.py
@@ -0,0 +1,598 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
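+
+"""Executor that computes BigFrames values with BigQuery, caching executed
+expression subtrees in temporary tables so later executions can reuse them."""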
+ +from __future__ import annotations + +import math +import os +from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union +import warnings +import weakref + +import google.api_core.exceptions +from google.cloud import bigquery +import google.cloud.bigquery.job as bq_job +import google.cloud.bigquery.table as bq_table +import google.cloud.bigquery_storage_v1 + +import bigframes.core +import bigframes.core.compile +import bigframes.core.guid +import bigframes.core.nodes as nodes +import bigframes.core.ordering as order +import bigframes.core.tree_properties as tree_properties +import bigframes.dtypes +import bigframes.exceptions as bfe +import bigframes.features +from bigframes.session import executor, read_api_execution +import bigframes.session._io.bigquery as bq_io +import bigframes.session.metrics +import bigframes.session.planner +import bigframes.session.temporary_storage + +# Max complexity that should be executed as a single query +QUERY_COMPLEXITY_LIMIT = 1e7 +# Number of times to factor out subqueries before giving up. +MAX_SUBTREE_FACTORINGS = 5 +_MAX_CLUSTER_COLUMNS = 4 +MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G + + +class BigQueryCachingExecutor(executor.Executor): + """Computes BigFrames values using BigQuery Engine. + + This executor can cache expressions. If those expressions are executed later, this session + will re-use the pre-existing results from previous executions. + + This class is not thread-safe. + """ + + def __init__( + self, + bqclient: bigquery.Client, + storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager, + bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, + *, + strictly_ordered: bool = True, + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + ): + self.bqclient = bqclient + self.storage_manager = storage_manager + self.compiler: bigframes.core.compile.SQLCompiler = ( + bigframes.core.compile.SQLCompiler() + ) + self.strictly_ordered: bool = strictly_ordered + self._cached_executions: weakref.WeakKeyDictionary[ + nodes.BigFrameNode, nodes.BigFrameNode + ] = weakref.WeakKeyDictionary() + self.metrics = metrics + self.bqstoragereadclient = bqstoragereadclient + # Simple left-to-right precedence for now + self._semi_executors = ( + read_api_execution.ReadApiSemiExecutor( + bqstoragereadclient=bqstoragereadclient, + project=self.bqclient.project, + ), + ) + + def to_sql( + self, + array_value: bigframes.core.ArrayValue, + offset_column: Optional[str] = None, + ordered: bool = False, + enable_cache: bool = True, + ) -> str: + if offset_column: + array_value, _ = array_value.promote_offsets() + node = ( + self.replace_cached_subtrees(array_value.node) + if enable_cache + else array_value.node + ) + return self.compiler.compile(node, ordered=ordered) + + def execute( + self, + array_value: bigframes.core.ArrayValue, + *, + ordered: bool = True, + use_explicit_destination: Optional[bool] = None, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + ) -> executor.ExecuteResult: + if use_explicit_destination is None: + use_explicit_destination = bigframes.options.bigquery.allow_large_results + + if bigframes.options.compute.enable_multi_query_execution: + self._simplify_with_caching(array_value) + + plan = self.replace_cached_subtrees(array_value.node) + # Use explicit destination to avoid 10GB limit of temporary table + destination_table = ( + self.storage_manager.create_temp_table( + array_value.schema.to_bigquery(), cluster_cols=[] + ) + if 
use_explicit_destination + else None + ) + return self._execute_plan( + plan, + ordered=ordered, + page_size=page_size, + max_results=max_results, + destination=destination_table, + ) + + def export_gbq( + self, + array_value: bigframes.core.ArrayValue, + destination: bigquery.TableReference, + if_exists: Literal["fail", "replace", "append"] = "fail", + cluster_cols: Sequence[str] = [], + ): + """ + Export the ArrayValue to an existing BigQuery table. + """ + if bigframes.options.compute.enable_multi_query_execution: + self._simplify_with_caching(array_value) + + dispositions = { + "fail": bigquery.WriteDisposition.WRITE_EMPTY, + "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, + "append": bigquery.WriteDisposition.WRITE_APPEND, + } + sql = self.to_sql(array_value, ordered=False) + job_config = bigquery.QueryJobConfig( + write_disposition=dispositions[if_exists], + destination=destination, + clustering_fields=cluster_cols if cluster_cols else None, + ) + # TODO(swast): plumb through the api_name of the user-facing api that + # caused this query. + _, query_job = self._run_execute_query( + sql=sql, + job_config=job_config, + ) + + has_timedelta_col = any( + t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes + ) + + if if_exists != "append" and has_timedelta_col: + # Only update schema if this is not modifying an existing table, and the + # new table contains timedelta columns. + table = self.bqclient.get_table(destination) + table.schema = array_value.schema.to_bigquery() + self.bqclient.update_table(table, ["schema"]) + + return query_job + + def export_gcs( + self, + array_value: bigframes.core.ArrayValue, + uri: str, + format: Literal["json", "csv", "parquet"], + export_options: Mapping[str, Union[bool, str]], + ): + query_job = self.execute( + array_value, + ordered=False, + use_explicit_destination=True, + ).query_job + assert query_job is not None + result_table = query_job.destination + assert result_table is not None + export_data_statement = bq_io.create_export_data_statement( + f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", + uri=uri, + format=format, + export_options=dict(export_options), + ) + + bq_io.start_query_with_client( + self.bqclient, + export_data_statement, + job_config=bigquery.QueryJobConfig(), + api_name=f"dataframe-to_{format.lower()}", + metrics=self.metrics, + ) + return query_job + + def dry_run( + self, array_value: bigframes.core.ArrayValue, ordered: bool = True + ) -> bigquery.QueryJob: + sql = self.to_sql(array_value, ordered=ordered) + job_config = bigquery.QueryJobConfig(dry_run=True) + query_job = self.bqclient.query(sql, job_config=job_config) + return query_job + + def peek( + self, + array_value: bigframes.core.ArrayValue, + n_rows: int, + use_explicit_destination: Optional[bool] = None, + ) -> executor.ExecuteResult: + """ + A 'peek' efficiently accesses a small number of rows in the dataframe. 
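+
+        A minimal usage sketch (illustrative only; ``executor`` stands for an
+        instance of this class and ``array_value`` for a
+        ``bigframes.core.ArrayValue``):
+
+            result = executor.peek(array_value, n_rows=5)
+            for batch in result.arrow_batches():
+                ...  # consume pyarrow record batches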
+        """
+        plan = self.replace_cached_subtrees(array_value.node)
+        if not tree_properties.can_fast_peek(plan):
+            msg = bfe.format_message("Peeking this value cannot be done efficiently.")
+            warnings.warn(msg)
+        if use_explicit_destination is None:
+            use_explicit_destination = bigframes.options.bigquery.allow_large_results
+
+        destination_table = (
+            self.storage_manager.create_temp_table(
+                array_value.schema.to_bigquery(), cluster_cols=[]
+            )
+            if use_explicit_destination
+            else None
+        )
+
+        return self._execute_plan(
+            plan, ordered=False, destination=destination_table, peek=n_rows
+        )
+
+    def head(
+        self, array_value: bigframes.core.ArrayValue, n_rows: int
+    ) -> executor.ExecuteResult:
+
+        maybe_row_count = self._local_get_row_count(array_value)
+        if (maybe_row_count is not None) and (maybe_row_count <= n_rows):
+            return self.execute(array_value, ordered=True)
+
+        if not self.strictly_ordered and not array_value.node.explicitly_ordered:
+            # No user-provided ordering, so just get any N rows; it's faster!
+            return self.peek(array_value, n_rows)
+
+        plan = self.replace_cached_subtrees(array_value.node)
+        if not tree_properties.can_fast_head(plan):
+            # If we can't get the head fast, we will need to execute the whole query.
+            # Will want to do this in a way such that the result is reusable, but the first
+            # N values can be easily extracted.
+            # This currently requires clustering on offsets.
+            self._cache_with_offsets(array_value)
+            # Get a new optimized plan after caching
+            plan = self.replace_cached_subtrees(array_value.node)
+            assert tree_properties.can_fast_head(plan)
+
+        head_plan = generate_head_plan(plan, n_rows)
+        return self._execute_plan(head_plan, ordered=True)
+
+    def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int:
+        # TODO: Fold row count node in and use local execution
+        count = self._local_get_row_count(array_value)
+        if count is not None:
+            return count
+        else:
+            row_count_plan = self.replace_cached_subtrees(
+                generate_row_count_plan(array_value.node)
+            )
+            results = self._execute_plan(row_count_plan, ordered=True)
+            pa_table = next(results.arrow_batches())
+            pa_array = pa_table.column(0)
+            return pa_array.tolist()[0]
+
+    def cached(
+        self,
+        array_value: bigframes.core.ArrayValue,
+        *,
+        force: bool = False,
+        use_session: bool = False,
+        cluster_cols: Sequence[str] = (),
+    ) -> None:
+        """Write the block to a session table."""
+        # use a heuristic for whether something needs to be cached
+        if (not force) and self._is_trivially_executable(array_value):
+            return
+        if use_session:
+            self._cache_with_session_awareness(array_value)
+        else:
+            self._cache_with_cluster_cols(array_value, cluster_cols=cluster_cols)
+
+    def _local_get_row_count(
+        self, array_value: bigframes.core.ArrayValue
+    ) -> Optional[int]:
+        # optimized plan has cache materializations which will have row count metadata
+        # that is more likely to be usable than original leaf nodes.
+        plan = self.replace_cached_subtrees(array_value.node)
+        return tree_properties.row_count(plan)
+
+    # Helpers
+    def _run_execute_query(
+        self,
+        sql: str,
+        job_config: Optional[bq_job.QueryJobConfig] = None,
+        api_name: Optional[str] = None,
+        page_size: Optional[int] = None,
+        max_results: Optional[int] = None,
+        query_with_job: bool = True,
+    ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]:
+        """
+        Starts BigQuery query job and waits for results.
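+
+        For example (a sketch; the pagination arguments are simply forwarded
+        to ``bq_io.start_query_with_client``):
+
+            iterator, query_job = self._run_execute_query(
+                sql="SELECT 1",
+                page_size=1000,
+                query_with_job=False,  # query_job may be None in this mode
+            )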
+ """ + job_config = bq_job.QueryJobConfig() if job_config is None else job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + + if not self.strictly_ordered: + job_config.labels["bigframes-mode"] = "unordered" + + try: + iterator, query_job = bq_io.start_query_with_client( + self.bqclient, + sql, + job_config=job_config, + api_name=api_name, + max_results=max_results, + page_size=page_size, + metrics=self.metrics, + query_with_job=query_with_job, + ) + return iterator, query_job + + except google.api_core.exceptions.BadRequest as e: + # Unfortunately, this error type does not have a separate error code or exception type + if "Resources exceeded during query execution" in e.message: + new_message = "Computation is too complex to execute as a single query. Try using DataFrame.cache() on intermediate results, or setting bigframes.options.compute.enable_multi_query_execution." + raise bigframes.exceptions.QueryComplexityError(new_message) from e + else: + raise + + def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: + return nodes.top_down(node, lambda x: self._cached_executions.get(x, x)) + + def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): + """ + Can the block be evaluated very cheaply? + If True, the array_value probably is not worth caching. + """ + # Once rewriting is available, will want to rewrite before + # evaluating execution cost. + return tree_properties.is_trivially_executable( + self.replace_cached_subtrees(array_value.node) + ) + + def _cache_with_cluster_cols( + self, array_value: bigframes.core.ArrayValue, cluster_cols: Sequence[str] + ): + """Executes the query and uses the resulting table to rewrite future executions.""" + + sql, schema, ordering_info = self.compiler.compile_raw( + self.replace_cached_subtrees(array_value.node) + ) + tmp_table = self._sql_as_cached_temp_table( + sql, + schema, + cluster_cols=bq_io.select_cluster_cols(schema, cluster_cols), + ) + cached_replacement = array_value.as_cached( + cache_table=self.bqclient.get_table(tmp_table), + ordering=ordering_info, + ).node + self._cached_executions[array_value.node] = cached_replacement + + def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): + """Executes the query and uses the resulting table to rewrite future executions.""" + offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") + w_offsets, offset_column = array_value.promote_offsets() + sql = self.compiler.compile( + self.replace_cached_subtrees(w_offsets.node), ordered=False + ) + + tmp_table = self._sql_as_cached_temp_table( + sql, + w_offsets.schema.to_bigquery(), + cluster_cols=[offset_column], + ) + cached_replacement = array_value.as_cached( + cache_table=self.bqclient.get_table(tmp_table), + ordering=order.TotalOrdering.from_offset_col(offset_column), + ).node + self._cached_executions[array_value.node] = cached_replacement + + def _cache_with_session_awareness( + self, + array_value: bigframes.core.ArrayValue, + ) -> None: + session_forest = [obj._block._expr.node for obj in array_value.session.objects] + # These node types are cheap to re-compute + target, cluster_cols = bigframes.session.planner.session_aware_cache_plan( + array_value.node, list(session_forest) + ) + cluster_cols_sql_names = [id.sql for id in cluster_cols] + if len(cluster_cols) > 0: + self._cache_with_cluster_cols( + bigframes.core.ArrayValue(target), 
cluster_cols_sql_names
+            )
+        elif self.strictly_ordered:
+            self._cache_with_offsets(bigframes.core.ArrayValue(target))
+        else:
+            self._cache_with_cluster_cols(bigframes.core.ArrayValue(target), [])
+
+    def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue):
+        """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces."""
+        # Apply existing caching first
+        for _ in range(MAX_SUBTREE_FACTORINGS):
+            node_with_cache = self.replace_cached_subtrees(array_value.node)
+            if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT:
+                return
+
+            did_cache = self._cache_most_complex_subtree(array_value.node)
+            if not did_cache:
+                return
+
+    def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool:
+        # TODO: If query fails, retry with lower complexity limit
+        selection = tree_properties.select_cache_target(
+            node,
+            min_complexity=(QUERY_COMPLEXITY_LIMIT / 500),
+            max_complexity=QUERY_COMPLEXITY_LIMIT,
+            cache=dict(self._cached_executions),
+            # Heuristic: subtree_complexity * (copies of subtree)^2
+            heuristic=lambda complexity, count: math.log(complexity)
+            + 2 * math.log(count),
+        )
+        if selection is None:
+            # No good subtrees to cache, just return original tree
+            return False
+
+        self._cache_with_cluster_cols(bigframes.core.ArrayValue(selection), [])
+        return True
+
+    def _sql_as_cached_temp_table(
+        self,
+        sql: str,
+        schema: Sequence[bigquery.SchemaField],
+        cluster_cols: Sequence[str],
+    ) -> bigquery.TableReference:
+        assert len(cluster_cols) <= _MAX_CLUSTER_COLUMNS
+        temp_table = self.storage_manager.create_temp_table(schema, cluster_cols)
+
+        # TODO: Get default job config settings
+        job_config = cast(
+            bigquery.QueryJobConfig,
+            bigquery.QueryJobConfig.from_api_repr({}),
+        )
+        job_config.destination = temp_table
+        _, query_job = self._run_execute_query(
+            sql,
+            job_config=job_config,
+            api_name="cached",
+        )
+        assert query_job is not None
+        query_job.result()
+        return query_job.destination
+
+    def _validate_result_schema(
+        self,
+        array_value: bigframes.core.ArrayValue,
+        bq_schema: list[bigquery.SchemaField],
+    ):
+        actual_schema = _sanitize(tuple(bq_schema))
+        ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema(
+            self.replace_cached_subtrees(array_value.node)
+        ).to_bigquery()
+        internal_schema = _sanitize(array_value.schema.to_bigquery())
+        if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
+            return
+
+        if internal_schema != actual_schema:
+            raise ValueError(
+                f"This error should only occur while testing. BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}"
+            )
+
+        if ibis_schema != actual_schema:
+            raise ValueError(
+                f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}"
+            )
+
+    def _execute_plan(
+        self,
+        plan: nodes.BigFrameNode,
+        ordered: bool,
+        page_size: Optional[int] = None,
+        max_results: Optional[int] = None,
+        destination: Optional[bq_table.TableReference] = None,
+        peek: Optional[int] = None,
+    ):
+        """Just execute whatever plan as is, without further caching or decomposition."""
+
+        # First try to execute fast-paths
+        # TODO: Allow page_size and max_results by rechunking/truncating results
+        if (not page_size) and (not max_results) and (not destination) and (not peek):
+            for semi_executor in self._semi_executors:
+                maybe_result = semi_executor.execute(plan, ordered=ordered)
+                if maybe_result:
+                    return maybe_result
+
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
+        job_config = bigquery.QueryJobConfig()
+        # Use explicit destination to avoid 10GB limit of temporary table
+        if destination is not None:
+            job_config.destination = destination
+        sql = self.compiler.compile(plan, ordered=ordered, limit=peek)
+        iterator, query_job = self._run_execute_query(
+            sql=sql,
+            job_config=job_config,
+            page_size=page_size,
+            max_results=max_results,
+            query_with_job=(destination is not None),
+        )
+
+        # Though we provide the read client, iterator may or may not use it based on what is efficient for the result
+        def iterator_supplier():
+            # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
+            if iterator._page_size is not None or iterator.max_results is not None:
+                return iterator.to_arrow_iterable(bqstorage_client=None)
+            else:
+                return iterator.to_arrow_iterable(
+                    bqstorage_client=self.bqstoragereadclient
+                )
+
+        if query_job:
+            size_bytes = self.bqclient.get_table(query_job.destination).num_bytes
+        else:
+            size_bytes = None
+
+        if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES:
+            msg = bfe.format_message(
+                "The query result size has exceeded 10 GB. In BigFrames 2.0 and "
+                "later, you might need to manually set `allow_large_results=True` in "
+                "the IO method or adjust the BigFrames option: "
+                "`bigframes.options.bigquery.allow_large_results=True`."
+            )
+            warnings.warn(msg, FutureWarning)
+        # Runs strict validations to ensure internal type predictions and ibis are completely in sync
+        # Do not execute these validations outside of testing suite.
+        if "PYTEST_CURRENT_TEST" in os.environ:
+            self._validate_result_schema(
+                bigframes.core.ArrayValue(plan), iterator.schema
+            )
+
+        return executor.ExecuteResult(
+            arrow_batches=iterator_supplier,
+            schema=plan.schema,
+            query_job=query_job,
+            total_bytes=size_bytes,
+            total_rows=iterator.total_rows,
+        )
+
+
+def _sanitize(
+    schema: Tuple[bigquery.SchemaField, ...]
+) -> Tuple[bigquery.SchemaField, ...]:
+    # Schemas inferred from SQL strings and Ibis expressions contain only names,
+    # types, and modes, so we disregard other fields (e.g. the timedelta
+    # description for timedelta columns) for validations.
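+    # Rebuilding each SchemaField from (name, field_type, mode) below drops any
+    # extra metadata, and nested `fields` are sanitized recursively so RECORD
+    # columns are compared structurally as well.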
+ return tuple( + bigquery.SchemaField( + f.name, + f.field_type, + f.mode, # type:ignore + fields=_sanitize(f.fields), + ) + for f in schema + ) + + +def generate_head_plan(node: nodes.BigFrameNode, n: int): + return nodes.SliceNode(node, start=None, stop=n) + + +def generate_row_count_plan(node: nodes.BigFrameNode): + return nodes.RowCountNode(node) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 2b24b6cb8b..86be8bd897 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -17,7 +17,6 @@ import os import typing from typing import Optional -import warnings import google.api_core.client_info import google.api_core.client_options @@ -32,7 +31,6 @@ import pydata_google_auth import bigframes.constants -import bigframes.exceptions as bfe import bigframes.version from . import environment @@ -43,16 +41,11 @@ # BigQuery is a REST API, which requires the protocol as part of the URL. -_BIGQUERY_LOCATIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com" _BIGQUERY_REGIONAL_ENDPOINT = "https://bigquery.{location}.rep.googleapis.com" # BigQuery Connection and Storage are gRPC APIs, which don't support the # https:// protocol in the API endpoint URL. -_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT = "{location}-bigqueryconnection.googleapis.com" -_BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT = "{location}-bigquerystorage.googleapis.com" -_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = ( - "https://bigquerystorage.{location}.rep.googleapis.com" -) +_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = "bigquerystorage.{location}.rep.googleapis.com" def _get_default_credentials_with_project(): @@ -114,19 +107,18 @@ def __init__( ) self._project = project - if ( - use_regional_endpoints - and location is not None - and location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - msg = bfe.format_message( - bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( - location=location - ), - fill=False, - ) - warnings.warn(msg, category=FutureWarning) + if use_regional_endpoints: + if location is None: + raise ValueError(bigframes.constants.LOCATION_NEEDED_FOR_REP_MESSAGE) + elif ( + location.lower() + not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS + ): + raise ValueError( + bigframes.constants.REP_NOT_SUPPORTED_MESSAGE.format( + location=location + ) + ) self._location = location self._use_regional_endpoints = use_regional_endpoints @@ -156,16 +148,8 @@ def _create_bigquery_client(self): api_endpoint=self._client_endpoints_override["bqclient"] ) elif self._use_regional_endpoints: - endpoint_template = _BIGQUERY_REGIONAL_ENDPOINT - if ( - self._location is not None - and self._location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - endpoint_template = _BIGQUERY_LOCATIONAL_ENDPOINT - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=endpoint_template.format(location=self._location) + api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format(location=self._location) ) bq_info = google.api_core.client_info.ClientInfo( @@ -212,12 +196,6 @@ def bqconnectionclient(self): bqconnection_options = google.api_core.client_options.ClientOptions( api_endpoint=self._client_endpoints_override["bqconnectionclient"] ) - elif self._use_regional_endpoints: - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT.format( - location=self._location - ) - ) bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name @@ 
-241,16 +219,10 @@ def bqstoragereadclient(self): api_endpoint=self._client_endpoints_override["bqstoragereadclient"] ) elif self._use_regional_endpoints: - endpoint_template = _BIGQUERYSTORAGE_REGIONAL_ENDPOINT - if ( - self._location is not None - and self._location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - endpoint_template = _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=endpoint_template.format(location=self._location) + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 07645c2a98..4c27c25058 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -16,53 +16,13 @@ import abc import dataclasses -import math -import os -from typing import ( - Callable, - cast, - Iterator, - Literal, - Mapping, - Optional, - Sequence, - Tuple, - Union, -) -import warnings -import weakref +from typing import Callable, Iterator, Literal, Mapping, Optional, Sequence, Union -import google.api_core.exceptions from google.cloud import bigquery -import google.cloud.bigquery.job as bq_job -import google.cloud.bigquery.table as bq_table -import google.cloud.bigquery_storage_v1 import pyarrow import bigframes.core -import bigframes.core.compile -import bigframes.core.guid -import bigframes.core.identifiers -import bigframes.core.nodes as nodes -import bigframes.core.ordering as order import bigframes.core.schema -import bigframes.core.tree_properties as tree_properties -import bigframes.dtypes -import bigframes.exceptions as bfe -import bigframes.features -import bigframes.session._io.bigquery as bq_io -import bigframes.session.metrics -import bigframes.session.planner -import bigframes.session.temp_storage - -# Max complexity that should be executed as a single query -QUERY_COMPLEXITY_LIMIT = 1e7 -# Number of times to factor out subqueries before giving up. -MAX_SUBTREE_FACTORINGS = 5 -_MAX_CLUSTER_COLUMNS = 4 -# TODO: b/338258028 Enable pruning to reduce text size. -ENABLE_PRUNING = False -MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G @dataclasses.dataclass(frozen=True) @@ -181,532 +141,3 @@ def cached( cluster_cols: Sequence[str] = (), ) -> None: raise NotImplementedError("cached not implemented for this executor") - - -class BigQueryCachingExecutor(Executor): - """Computes BigFrames values using BigQuery Engine. - - This executor can cache expressions. If those expressions are executed later, this session - will re-use the pre-existing results from previous executions. - - This class is not thread-safe. 
- """ - - def __init__( - self, - bqclient: bigquery.Client, - storage_manager: bigframes.session.temp_storage.AnonymousDatasetManager, - bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, - *, - strictly_ordered: bool = True, - metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, - ): - self.bqclient = bqclient - self.storage_manager = storage_manager - self.compiler: bigframes.core.compile.SQLCompiler = ( - bigframes.core.compile.SQLCompiler() - ) - self.strictly_ordered: bool = strictly_ordered - self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.BigFrameNode - ] = weakref.WeakKeyDictionary() - self.metrics = metrics - self.bqstoragereadclient = bqstoragereadclient - - def to_sql( - self, - array_value: bigframes.core.ArrayValue, - offset_column: Optional[str] = None, - ordered: bool = False, - enable_cache: bool = True, - ) -> str: - if offset_column: - array_value, internal_offset_col = array_value.promote_offsets() - node = ( - self.replace_cached_subtrees(array_value.node) - if enable_cache - else array_value.node - ) - return self.compiler.compile(node, ordered=ordered) - - def execute( - self, - array_value: bigframes.core.ArrayValue, - *, - ordered: bool = True, - use_explicit_destination: Optional[bool] = None, - page_size: Optional[int] = None, - max_results: Optional[int] = None, - ): - if use_explicit_destination is None: - use_explicit_destination = bigframes.options.bigquery.allow_large_results - - if bigframes.options.compute.enable_multi_query_execution: - self._simplify_with_caching(array_value) - - sql = self.to_sql(array_value, ordered=ordered) - job_config = bigquery.QueryJobConfig() - # Use explicit destination to avoid 10GB limit of temporary table - if use_explicit_destination: - destination_table = self.storage_manager.allocate_and_create_temp_table( - array_value.schema.to_bigquery(), cluster_cols=[] - ) - job_config.destination = destination_table - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. - iterator, query_job = self._run_execute_query( - sql=sql, - job_config=job_config, - page_size=page_size, - max_results=max_results, - query_with_job=use_explicit_destination, - ) - - # Though we provide the read client, iterator may or may not use it based on what is efficient for the result - def iterator_supplier(): - return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient) - - if query_job: - size_bytes = self.bqclient.get_table(query_job.destination).num_bytes - else: - size_bytes = None - - if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES: - msg = bfe.format_message( - "The query result size has exceeded 10 GB. In BigFrames 2.0 and " - "later, you might need to manually set `allow_large_results=True` in " - "the IO method or adjust the BigFrames option: " - "`bigframes.options.bigquery.allow_large_results=True`." - ) - warnings.warn(msg, FutureWarning) - # Runs strict validations to ensure internal type predictions and ibis are completely in sync - # Do not execute these validations outside of testing suite. 
- if "PYTEST_CURRENT_TEST" in os.environ: - self._validate_result_schema(array_value, iterator.schema) - - return ExecuteResult( - arrow_batches=iterator_supplier, - schema=array_value.schema, - query_job=query_job, - total_bytes=size_bytes, - total_rows=iterator.total_rows, - ) - - def export_gbq( - self, - array_value: bigframes.core.ArrayValue, - destination: bigquery.TableReference, - if_exists: Literal["fail", "replace", "append"] = "fail", - cluster_cols: Sequence[str] = [], - ): - """ - Export the ArrayValue to an existing BigQuery table. - """ - if bigframes.options.compute.enable_multi_query_execution: - self._simplify_with_caching(array_value) - - dispositions = { - "fail": bigquery.WriteDisposition.WRITE_EMPTY, - "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, - "append": bigquery.WriteDisposition.WRITE_APPEND, - } - sql = self.to_sql(array_value, ordered=False) - job_config = bigquery.QueryJobConfig( - write_disposition=dispositions[if_exists], - destination=destination, - clustering_fields=cluster_cols if cluster_cols else None, - ) - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. - _, query_job = self._run_execute_query( - sql=sql, - job_config=job_config, - ) - - has_timedelta_col = any( - t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes - ) - - if if_exists != "append" and has_timedelta_col: - # Only update schema if this is not modifying an existing table, and the - # new table contains timedelta columns. - table = self.bqclient.get_table(destination) - table.schema = array_value.schema.to_bigquery() - self.bqclient.update_table(table, ["schema"]) - - return query_job - - def export_gcs( - self, - array_value: bigframes.core.ArrayValue, - uri: str, - format: Literal["json", "csv", "parquet"], - export_options: Mapping[str, Union[bool, str]], - ): - query_job = self.execute( - array_value, - ordered=False, - use_explicit_destination=True, - ).query_job - result_table = query_job.destination - export_data_statement = bq_io.create_export_data_statement( - f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", - uri=uri, - format=format, - export_options=dict(export_options), - ) - - bq_io.start_query_with_client( - self.bqclient, - export_data_statement, - job_config=bigquery.QueryJobConfig(), - api_name=f"dataframe-to_{format.lower()}", - metrics=self.metrics, - ) - return query_job - - def dry_run( - self, array_value: bigframes.core.ArrayValue, ordered: bool = True - ) -> bigquery.QueryJob: - sql = self.to_sql(array_value, ordered=ordered) - job_config = bigquery.QueryJobConfig(dry_run=True) - query_job = self.bqclient.query(sql, job_config=job_config) - return query_job - - def peek( - self, - array_value: bigframes.core.ArrayValue, - n_rows: int, - use_explicit_destination: Optional[bool] = None, - ) -> ExecuteResult: - """ - A 'peek' efficiently accesses a small number of rows in the dataframe. 
- """ - plan = self.replace_cached_subtrees(array_value.node) - if not tree_properties.can_fast_peek(plan): - msg = bfe.format_message("Peeking this value cannot be done efficiently.") - warnings.warn(msg) - if use_explicit_destination is None: - use_explicit_destination = bigframes.options.bigquery.allow_large_results - - job_config = bigquery.QueryJobConfig() - # Use explicit destination to avoid 10GB limit of temporary table - if use_explicit_destination: - destination_table = self.storage_manager.allocate_and_create_temp_table( - array_value.schema.to_bigquery(), cluster_cols=[] - ) - job_config.destination = destination_table - - sql = self.compiler.compile(plan, ordered=False, limit=n_rows) - - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. - iterator, query_job = self._run_execute_query( - sql=sql, job_config=job_config, query_with_job=use_explicit_destination - ) - return ExecuteResult( - # Probably don't need read client for small peek results, but let client decide - arrow_batches=lambda: iterator.to_arrow_iterable( - bqstorage_client=self.bqstoragereadclient - ), - schema=array_value.schema, - query_job=query_job, - total_rows=iterator.total_rows, - ) - - def head( - self, array_value: bigframes.core.ArrayValue, n_rows: int - ) -> ExecuteResult: - - maybe_row_count = self._local_get_row_count(array_value) - if (maybe_row_count is not None) and (maybe_row_count <= n_rows): - return self.execute(array_value, ordered=True) - - if not self.strictly_ordered and not array_value.node.explicitly_ordered: - # No user-provided ordering, so just get any N rows, its faster! - return self.peek(array_value, n_rows) - - plan = self.replace_cached_subtrees(array_value.node) - if not tree_properties.can_fast_head(plan): - # If can't get head fast, we are going to need to execute the whole query - # Will want to do this in a way such that the result is reusable, but the first - # N values can be easily extracted. - # This currently requires clustering on offsets. - self._cache_with_offsets(array_value) - # Get a new optimized plan after caching - plan = self.replace_cached_subtrees(array_value.node) - assert tree_properties.can_fast_head(plan) - - head_plan = generate_head_plan(plan, n_rows) - sql = self.compiler.compile(head_plan) - - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. 
- iterator, query_job = self._run_execute_query(sql=sql) - return ExecuteResult( - # Probably don't need read client for small head results, but let client decide - arrow_batches=lambda: iterator.to_arrow_iterable( - bqstorage_client=self.bqstoragereadclient - ), - schema=array_value.schema, - query_job=query_job, - total_rows=iterator.total_rows, - ) - - def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: - count = self._local_get_row_count(array_value) - if count is not None: - return count - else: - row_count_plan = self.replace_cached_subtrees( - generate_row_count_plan(array_value.node) - ) - sql = self.compiler.compile(row_count_plan, ordered=False) - iter, _ = self._run_execute_query(sql, query_with_job=False) - return next(iter)[0] - - def cached( - self, - array_value: bigframes.core.ArrayValue, - *, - force: bool = False, - use_session: bool = False, - cluster_cols: Sequence[str] = (), - ) -> None: - """Write the block to a session table.""" - # use a heuristic for whether something needs to be cached - if (not force) and self._is_trivially_executable(array_value): - return - if use_session: - self._cache_with_session_awareness(array_value) - else: - self._cache_with_cluster_cols(array_value, cluster_cols=cluster_cols) - - def _local_get_row_count( - self, array_value: bigframes.core.ArrayValue - ) -> Optional[int]: - # optimized plan has cache materializations which will have row count metadata - # that is more likely to be usable than original leaf nodes. - plan = self.replace_cached_subtrees(array_value.node) - return tree_properties.row_count(plan) - - # Helpers - def _run_execute_query( - self, - sql: str, - job_config: Optional[bq_job.QueryJobConfig] = None, - api_name: Optional[str] = None, - page_size: Optional[int] = None, - max_results: Optional[int] = None, - query_with_job: bool = True, - ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]: - """ - Starts BigQuery query job and waits for results. - """ - job_config = bq_job.QueryJobConfig() if job_config is None else job_config - if bigframes.options.compute.maximum_bytes_billed is not None: - job_config.maximum_bytes_billed = ( - bigframes.options.compute.maximum_bytes_billed - ) - - if not self.strictly_ordered: - job_config.labels["bigframes-mode"] = "unordered" - - # Note: add_and_trim_labels is global scope which may have unexpected effects - # Ensure no additional labels are added to job_config after this point, - # as `add_and_trim_labels` ensures the label count does not exceed 64. - bq_io.add_and_trim_labels(job_config, api_name=api_name) - try: - iterator, query_job = bq_io.start_query_with_client( - self.bqclient, - sql, - job_config=job_config, - api_name=api_name, - max_results=max_results, - page_size=page_size, - metrics=self.metrics, - query_with_job=query_with_job, - ) - return iterator, query_job - - except google.api_core.exceptions.BadRequest as e: - # Unfortunately, this error type does not have a separate error code or exception type - if "Resources exceeded during query execution" in e.message: - new_message = "Computation is too complex to execute as a single query. Try using DataFrame.cache() on intermediate results, or setting bigframes.options.compute.enable_multi_query_execution." 
- raise bigframes.exceptions.QueryComplexityError(new_message) from e - else: - raise - - def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(node, lambda x: self._cached_executions.get(x, x)) - - def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): - """ - Can the block be evaluated very cheaply? - If True, the array_value probably is not worth caching. - """ - # Once rewriting is available, will want to rewrite before - # evaluating execution cost. - return tree_properties.is_trivially_executable( - self.replace_cached_subtrees(array_value.node) - ) - - def _cache_with_cluster_cols( - self, array_value: bigframes.core.ArrayValue, cluster_cols: Sequence[str] - ): - """Executes the query and uses the resulting table to rewrite future executions.""" - - sql, schema, ordering_info = self.compiler.compile_raw( - self.replace_cached_subtrees(array_value.node) - ) - tmp_table = self._sql_as_cached_temp_table( - sql, - schema, - cluster_cols=bq_io.select_cluster_cols(schema, cluster_cols), - ) - cached_replacement = array_value.as_cached( - cache_table=self.bqclient.get_table(tmp_table), - ordering=ordering_info, - ).node - self._cached_executions[array_value.node] = cached_replacement - - def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): - """Executes the query and uses the resulting table to rewrite future executions.""" - offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") - w_offsets, offset_column = array_value.promote_offsets() - sql = self.compiler.compile( - self.replace_cached_subtrees(w_offsets.node), ordered=False - ) - - tmp_table = self._sql_as_cached_temp_table( - sql, - w_offsets.schema.to_bigquery(), - cluster_cols=[offset_column], - ) - cached_replacement = array_value.as_cached( - cache_table=self.bqclient.get_table(tmp_table), - ordering=order.TotalOrdering.from_offset_col(offset_column), - ).node - self._cached_executions[array_value.node] = cached_replacement - - def _cache_with_session_awareness( - self, - array_value: bigframes.core.ArrayValue, - ) -> None: - session_forest = [obj._block._expr.node for obj in array_value.session.objects] - # These node types are cheap to re-compute - target, cluster_cols = bigframes.session.planner.session_aware_cache_plan( - array_value.node, list(session_forest) - ) - cluster_cols_sql_names = [id.sql for id in cluster_cols] - if len(cluster_cols) > 0: - self._cache_with_cluster_cols( - bigframes.core.ArrayValue(target), cluster_cols_sql_names - ) - elif self.strictly_ordered: - self._cache_with_offsets(bigframes.core.ArrayValue(target)) - else: - self._cache_with_cluster_cols(bigframes.core.ArrayValue(target), []) - - def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue): - """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" - # Apply existing caching first - for _ in range(MAX_SUBTREE_FACTORINGS): - node_with_cache = self.replace_cached_subtrees(array_value.node) - if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT: - return - - did_cache = self._cache_most_complex_subtree(array_value.node) - if not did_cache: - return - - def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool: - # TODO: If query fails, retry with lower complexity limit - selection = tree_properties.select_cache_target( - node, - min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), - max_complexity=QUERY_COMPLEXITY_LIMIT, - 
cache=dict(self._cached_executions), - # Heuristic: subtree_compleixty * (copies of subtree)^2 - heuristic=lambda complexity, count: math.log(complexity) - + 2 * math.log(count), - ) - if selection is None: - # No good subtrees to cache, just return original tree - return False - - self._cache_with_cluster_cols(bigframes.core.ArrayValue(selection), []) - return True - - def _sql_as_cached_temp_table( - self, - sql: str, - schema: Sequence[bigquery.SchemaField], - cluster_cols: Sequence[str], - ) -> bigquery.TableReference: - assert len(cluster_cols) <= _MAX_CLUSTER_COLUMNS - temp_table = self.storage_manager.allocate_and_create_temp_table( - schema, cluster_cols - ) - - # TODO: Get default job config settings - job_config = cast( - bigquery.QueryJobConfig, - bigquery.QueryJobConfig.from_api_repr({}), - ) - job_config.destination = temp_table - _, query_job = self._run_execute_query( - sql, - job_config=job_config, - api_name="cached", - ) - assert query_job is not None - query_job.result() - return query_job.destination - - def _validate_result_schema( - self, - array_value: bigframes.core.ArrayValue, - bq_schema: list[bigquery.SchemaField], - ): - actual_schema = _sanitize(tuple(bq_schema)) - ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema( - self.replace_cached_subtrees(array_value.node) - ).to_bigquery() - internal_schema = _sanitize(array_value.schema.to_bigquery()) - if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - return - - if internal_schema != actual_schema: - raise ValueError( - f"This error should only occur while testing. BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}" - ) - - if ibis_schema != actual_schema: - raise ValueError( - f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" - ) - - -def _sanitize( - schema: Tuple[bigquery.SchemaField, ...] -) -> Tuple[bigquery.SchemaField, ...]: - # Schema inferred from SQL strings and Ibis expressions contain only names, types and modes, - # so we disregard other fields (e.g timedelta description for timedelta columns) for validations. 
- return tuple( - bigquery.SchemaField( - f.name, - f.field_type, - f.mode, # type:ignore - fields=_sanitize(f.fields), - ) - for f in schema - ) - - -def generate_head_plan(node: nodes.BigFrameNode, n: int): - return nodes.SliceNode(node, start=None, stop=n) - - -def generate_row_count_plan(node: nodes.BigFrameNode): - return nodes.RowCountNode(node) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 1296e9d1b3..bdcada6364 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -17,60 +17,62 @@ import copy import dataclasses import datetime +import io import itertools import os import typing -from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union +from typing import ( + Dict, + Hashable, + IO, + Iterable, + List, + Literal, + Optional, + Sequence, + Tuple, +) import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions -import google.auth.credentials import google.cloud.bigquery as bigquery import google.cloud.bigquery.table -import google.cloud.bigquery_connection_v1 -import google.cloud.bigquery_storage_v1 -import google.cloud.functions_v2 -import google.cloud.resourcemanager_v3 -import jellyfish import pandas -import pandas_gbq.schema.pandas_to_bigquery # type: ignore +import pyarrow as pa -import bigframes.clients -import bigframes.constants +from bigframes.core import local_data, utils import bigframes.core as core import bigframes.core.blocks as blocks -import bigframes.core.compile -import bigframes.core.expression as expression -import bigframes.core.guid -import bigframes.core.ordering -import bigframes.core.pruning import bigframes.core.schema as schemata -import bigframes.dataframe import bigframes.dtypes -import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers -import bigframes.operations -import bigframes.operations.aggregations as agg_ops import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table -import bigframes.session._io.pandas as bf_io_pandas -import bigframes.session.clients -import bigframes.session.executor import bigframes.session.metrics -import bigframes.session.planner -import bigframes.session.temp_storage +import bigframes.session.temporary_storage import bigframes.session.time as session_time -import bigframes.version # Avoid circular imports. 
if typing.TYPE_CHECKING: - import bigframes.core.indexes import bigframes.dataframe as dataframe - import bigframes.series import bigframes.session -_MAX_CLUSTER_COLUMNS = 4 +_PLACEHOLDER_SCHEMA = ( + google.cloud.bigquery.SchemaField("bf_loader_placeholder", "INTEGER"), +) + +_LOAD_JOB_TYPE_OVERRIDES = { + # Json load jobs not supported yet: b/271321143 + bigframes.dtypes.JSON_DTYPE: "STRING", + # Timedelta is emulated using integer in bq type system + bigframes.dtypes.TIMEDELTA_DTYPE: "INTEGER", +} + +_STREAM_JOB_TYPE_OVERRIDES = { + # Timedelta is emulated using integer in bq type system + bigframes.dtypes.TIMEDELTA_DTYPE: "INTEGER", +} def _to_index_cols( @@ -87,6 +89,31 @@ def _to_index_cols( return index_cols +def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]): + index_cols_list = list(index_cols) if index_cols is not None else [] + columns_list = list(columns) if columns is not None else [] + set_index = set(index_cols_list) + set_columns = set(columns_list) + + if len(index_cols_list) > len(set_index): + raise ValueError( + "The 'index_col' argument contains duplicate names. " + "All column names specified in 'index_col' must be unique." + ) + + if len(columns_list) > len(set_columns): + raise ValueError( + "The 'columns' argument contains duplicate names. " + "All column names specified in 'columns' must be unique." + ) + + if not set_index.isdisjoint(set_columns): + raise ValueError( + "Found column names that exist in both 'index_col' and 'columns' arguments. " + "These arguments must specify distinct sets of columns." + ) + + @dataclasses.dataclass class GbqDataLoader: """ @@ -115,7 +142,7 @@ def __init__( self, session: bigframes.session.Session, bqclient: bigquery.Client, - storage_manager: bigframes.session.temp_storage.AnonymousDatasetManager, + storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager, default_index_type: bigframes.enums.DefaultIndexKind, scan_index_uniqueness: bool, force_total_order: bool, @@ -135,136 +162,120 @@ def __init__( self._clock = session_time.BigQuerySyncedClock(bqclient) self._clock.sync() - def read_pandas_load_job( - self, pandas_dataframe: pandas.DataFrame, api_name: str + def read_pandas( + self, + pandas_dataframe: pandas.DataFrame, + method: Literal["load", "stream"], + api_name: str, ) -> dataframe.DataFrame: - import bigframes.dataframe as dataframe + # TODO: Push this into from_pandas, along with index flag + from bigframes import dataframe - df_and_labels = bf_io_pandas.pandas_to_bq_compatible(pandas_dataframe) - pandas_dataframe_copy = df_and_labels.df - new_idx_ids = pandas_dataframe_copy.index.names - ordering_col = df_and_labels.ordering_col - - # TODO(https://github.com/googleapis/python-bigquery-pandas/issues/760): - # Once pandas-gbq can show a link to the running load job like - # bigframes does, switch to using pandas-gbq to load the - # bigquery-compatible pandas DataFrame. 
- schema: list[ - bigquery.SchemaField - ] = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( - pandas_dataframe_copy, - index=True, + val_cols, idx_cols = utils.get_standardized_ids( + pandas_dataframe.columns, pandas_dataframe.index.names, strict=True + ) + prepared_df = pandas_dataframe.reset_index(drop=False).set_axis( + [*idx_cols, *val_cols], axis="columns" ) + managed_data = local_data.ManagedArrowTable.from_pandas(prepared_df) - job_config = bigquery.LoadJobConfig() - job_config.schema = schema + if method == "load": + array_value = self.load_data(managed_data, api_name=api_name) + elif method == "stream": + array_value = self.stream_data(managed_data) + else: + raise ValueError(f"Unsupported read method {method}") - # TODO: Remove this. It's likely that the slower load job due to - # clustering doesn't improve speed of queries because pandas tables are - # small. - cluster_cols = [ordering_col] - job_config.clustering_fields = cluster_cols + block = blocks.Block( + array_value, + index_columns=idx_cols, + column_labels=pandas_dataframe.columns, + index_labels=pandas_dataframe.index.names, + ) + return dataframe.DataFrame(block) - job_config.labels = {"bigframes-api": api_name} + def load_data( + self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None + ) -> core.ArrayValue: + """Load managed data into bigquery""" + ordering_col = "bf_load_job_offsets" - load_table_destination = self._storage_manager.allocate_temp_table() - load_job = self._bqclient.load_table_from_dataframe( - pandas_dataframe_copy, - load_table_destination, - job_config=job_config, + # JSON support incomplete + for item in data.schema.items: + _validate_dtype_can_load(item.column, item.dtype) + + schema_w_offsets = data.schema.append( + schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) ) - self._start_generic_job(load_job) + bq_schema = schema_w_offsets.to_bigquery(_LOAD_JOB_TYPE_OVERRIDES) + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.PARQUET + job_config.schema = bq_schema + if api_name: + job_config.labels = {"bigframes-api": api_name} + + load_table_destination = self._storage_manager.create_temp_table( + bq_schema, [ordering_col] + ) + + buffer = io.BytesIO() + data.to_parquet( + buffer, + offsets_col=ordering_col, + geo_format="wkt", + duration_type="duration", + json_type="string", + ) + buffer.seek(0) + load_job = self._bqclient.load_table_from_file( + buffer, destination=load_table_destination, job_config=job_config + ) + self._start_generic_job(load_job) + # must get table metadata after load job for accurate metadata destination_table = self._bqclient.get_table(load_table_destination) - array_value = core.ArrayValue.from_table( + return core.ArrayValue.from_table( table=destination_table, - # TODO (b/394156190): Generate this directly from original pandas df. 
- schema=schemata.ArraySchema.from_bq_table( - destination_table, df_and_labels.col_type_overrides - ), + schema=schema_w_offsets, session=self._session, offsets_col=ordering_col, + n_rows=data.data.num_rows, ).drop_columns([ordering_col]) - block = blocks.Block( - array_value, - index_columns=new_idx_ids, - column_labels=df_and_labels.column_labels, - index_labels=df_and_labels.index_labels, + def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue: + """Load managed data into bigquery""" + ordering_col = "bf_stream_job_offsets" + schema_w_offsets = data.schema.append( + schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE) ) - return dataframe.DataFrame(block) - - def read_pandas_streaming( - self, - pandas_dataframe: pandas.DataFrame, - ) -> dataframe.DataFrame: - """Same as pandas_to_bigquery_load, but uses the BQ legacy streaming API.""" - import bigframes.dataframe as dataframe - - df_and_labels = bf_io_pandas.pandas_to_bq_compatible(pandas_dataframe) - pandas_dataframe_copy = df_and_labels.df - new_idx_ids = pandas_dataframe_copy.index.names - ordering_col = df_and_labels.ordering_col - - # TODO(https://github.com/googleapis/python-bigquery-pandas/issues/300): - # Once pandas-gbq can do streaming inserts (again), switch to using - # pandas-gbq to write the bigquery-compatible pandas DataFrame. - schema: list[ - bigquery.SchemaField - ] = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( - pandas_dataframe_copy, - index=True, + bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES) + load_table_destination = self._storage_manager.create_temp_table( + bq_schema, [ordering_col] ) - destination = self._storage_manager.allocate_and_create_temp_table( - schema, - [ordering_col], + rows = data.itertuples( + geo_format="wkt", duration_type="int", json_type="object" ) - destination_table = bigquery.Table(destination, schema=schema) - # TODO(swast): Confirm that the index is written. - for errors in self._bqclient.insert_rows_from_dataframe( - destination_table, - pandas_dataframe_copy, + rows_w_offsets = ((*row, offset) for offset, row in enumerate(rows)) + + for errors in self._bqclient.insert_rows( + load_table_destination, + rows_w_offsets, + selected_fields=bq_schema, + row_ids=map(str, itertools.count()), # used to ensure only-once insertion ): if errors: raise ValueError( f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}" ) - array_value = ( - core.ArrayValue.from_table( - table=destination_table, - schema=schemata.ArraySchema.from_bq_table( - destination_table, df_and_labels.col_type_overrides - ), - session=self._session, - # Don't set the offsets column because we want to group by it. - ) - # There may be duplicate rows because of hidden retries, so use a query to - # deduplicate based on the ordering ID, which is guaranteed to be unique. - # We know that rows with same ordering ID are duplicates, - # so ANY_VALUE() is deterministic. 
- .aggregate( - by_column_ids=[ordering_col], - aggregations=[ - ( - expression.UnaryAggregation( - agg_ops.AnyValueOp(), - expression.deref(field.name), - ), - field.name, - ) - for field in destination_table.schema - if field.name != ordering_col - ], - ).drop_columns([ordering_col]) - ) - block = blocks.Block( - array_value, - index_columns=new_idx_ids, - column_labels=df_and_labels.column_labels, - index_labels=df_and_labels.index_labels, - ) - return dataframe.DataFrame(block) + destination_table = self._bqclient.get_table(load_table_destination) + return core.ArrayValue.from_table( + table=destination_table, + schema=schema_w_offsets, + session=self._session, + offsets_col=ordering_col, + n_rows=data.data.num_rows, + ).drop_columns([ordering_col]) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: @@ -281,11 +292,12 @@ def read_gbq_table( index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), max_results: Optional[int] = None, - api_name: str, + api_name: str = "read_gbq_table", use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), enable_snapshot: bool = True, ) -> dataframe.DataFrame: + import bigframes._tools.strings import bigframes.dataframe as dataframe # --------------------------------- @@ -326,7 +338,9 @@ def read_gbq_table( if key not in table_column_names: possibility = min( table_column_names, - key=lambda item: jellyfish.levenshtein_distance(key, item), + key=lambda item: bigframes._tools.strings.levenshtein_distance( + key, item + ), ) raise ValueError( f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?" @@ -339,12 +353,15 @@ def read_gbq_table( table=table, index_col=index_col, ) + _check_column_duplicates(index_cols, columns) for key in index_cols: if key not in table_column_names: possibility = min( table_column_names, - key=lambda item: jellyfish.levenshtein_distance(key, item), + key=lambda item: bigframes._tools.strings.levenshtein_distance( + key, item + ), ) raise ValueError( f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" 
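The hunk above routes `read_gbq_table` validation through `_check_column_duplicates` (defined earlier in this file). A minimal standalone sketch of the same checks, for illustration only — the function below mirrors the diff's semantics but is not part of it:

```python
def check_column_duplicates(index_cols, columns):
    """Mirror of _check_column_duplicates: reject duplicate or overlapping names."""
    index_cols = list(index_cols) if index_cols is not None else []
    columns = list(columns) if columns is not None else []
    if len(index_cols) > len(set(index_cols)):
        raise ValueError("The 'index_col' argument contains duplicate names.")
    if len(columns) > len(set(columns)):
        raise ValueError("The 'columns' argument contains duplicate names.")
    if not set(index_cols).isdisjoint(columns):
        raise ValueError(
            "Found column names that exist in both 'index_col' and 'columns' arguments."
        )

check_column_duplicates(["id"], ["name", "value"])  # ok: disjoint, no duplicates
check_column_duplicates(["id"], ["id", "value"])    # raises ValueError
```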
@@ -384,7 +401,7 @@ def read_gbq_table( query, index_col=index_cols, columns=columns, - api_name="read_gbq_table", + api_name=api_name, use_cache=use_cache, ) @@ -494,29 +511,28 @@ def read_gbq_table( df.sort_index() return df - def _read_bigquery_load_job( + def read_bigquery_load_job( self, filepath_or_buffer: str | IO["bytes"], - table: Union[bigquery.Table, bigquery.TableReference], *, job_config: bigquery.LoadJobConfig, index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), ) -> dataframe.DataFrame: - index_cols = _to_index_cols(index_col) - - if not job_config.clustering_fields and index_cols: - job_config.clustering_fields = index_cols[:_MAX_CLUSTER_COLUMNS] - + # Need to create session table beforehand + table = self._storage_manager.create_temp_table(_PLACEHOLDER_SCHEMA) + # but, we just overwrite the placeholder schema immediately with the load job + job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE if isinstance(filepath_or_buffer, str): + filepath_or_buffer = os.path.expanduser(filepath_or_buffer) if filepath_or_buffer.startswith("gs://"): load_job = self._bqclient.load_table_from_uri( - filepath_or_buffer, table, job_config=job_config + filepath_or_buffer, destination=table, job_config=job_config ) elif os.path.exists(filepath_or_buffer): # local file path with open(filepath_or_buffer, "rb") as source_file: load_job = self._bqclient.load_table_from_file( - source_file, table, job_config=job_config + source_file, destination=table, job_config=job_config ) else: raise NotImplementedError( @@ -525,21 +541,12 @@ def _read_bigquery_load_job( ) else: load_job = self._bqclient.load_table_from_file( - filepath_or_buffer, table, job_config=job_config + filepath_or_buffer, destination=table, job_config=job_config ) self._start_generic_job(load_job) table_id = f"{table.project}.{table.dataset_id}.{table.table_id}" - # Update the table expiration so we aren't limited to the default 24 - # hours of the anonymous dataset. - table_expiration = bigquery.Table(table_id) - table_expiration.expires = ( - datetime.datetime.now(datetime.timezone.utc) - + bigframes.constants.DEFAULT_EXPIRATION - ) - self._bqclient.update_table(table_expiration, ["expires"]) - # The BigQuery REST API for tables.get doesn't take a session ID, so we # can't get the schema for a temp table that way. @@ -588,6 +595,7 @@ def read_gbq_query( ) index_cols = _to_index_cols(index_col) + _check_column_duplicates(index_cols, columns) filters_copy1, filters_copy2 = itertools.tee(filters) has_filters = len(list(filters_copy1)) != 0 @@ -673,9 +681,7 @@ def _query_to_destination( ) else: cluster_cols = [] - temp_table = self._storage_manager.allocate_and_create_temp_table( - schema, cluster_cols - ) + temp_table = self._storage_manager.create_temp_table(schema, cluster_cols) timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get( "timeoutMs" @@ -761,3 +767,44 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: configuration["jobTimeoutMs"] = timeout_ms return configuration + + +def _has_json_arrow_type(arrow_type: pa.DataType) -> bool: + """ + Searches recursively for JSON array type within a PyArrow DataType. 
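+
+    For example, list<json> and struct<a: json> both contain a JSON Arrow
+    type, while list<string> does not.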
+ """ + if arrow_type == bigframes.dtypes.JSON_ARROW_TYPE: + return True + if pa.types.is_list(arrow_type): + return _has_json_arrow_type(arrow_type.value_type) + if pa.types.is_struct(arrow_type): + for i in range(arrow_type.num_fields): + if _has_json_arrow_type(arrow_type.field(i).type): + return True + return False + return False + + +def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): + """ + Determines whether a datatype is supported by bq load jobs. + + Due to a BigQuery IO limitation with loading JSON from Parquet files (b/374784249), + we're using a workaround: storing JSON as strings and then parsing them into JSON + objects. + TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. + + Raises: + NotImplementedError: Type is not yet supported by load jobs. + """ + # we can handle top-level json, but not nested yet through string conversion + if column_type == bigframes.dtypes.JSON_DTYPE: + return + + if isinstance(column_type, pandas.ArrowDtype) and _has_json_arrow_type( + column_type.pyarrow_dtype + ): + raise NotImplementedError( + f"Nested JSON types, found in column `{name}`: `{column_type}`', " + f"are currently unsupported for upload. {constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py new file mode 100644 index 0000000000..32095e41f4 --- /dev/null +++ b/bigframes/session/read_api_execution.py @@ -0,0 +1,100 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Any, Optional + +from google.cloud import bigquery_storage_v1 + +from bigframes.core import bigframe_node, rewrite +from bigframes.session import executor, semi_executor + + +class ReadApiSemiExecutor(semi_executor.SemiExecutor): + """ + Executes plans reducible to a bq table scan by directly reading the table with the read api. 
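+
+    execute() returns None when the plan cannot be reduced to a plain table
+    scan, when an ordered result is requested from an explicitly ordered
+    source, or when a peek is requested, so callers can fall back to another
+    executor.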
+ """ + + def __init__( + self, bqstoragereadclient: bigquery_storage_v1.BigQueryReadClient, project: str + ): + self.bqstoragereadclient = bqstoragereadclient + self.project = project + + def execute( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + peek: Optional[int] = None, + ) -> Optional[executor.ExecuteResult]: + node = rewrite.try_reduce_to_table_scan(plan) + if not node: + return None + if node.explicitly_ordered and ordered: + return None + if peek: + # TODO: Support peeking + return None + + import google.cloud.bigquery_storage_v1.types as bq_storage_types + from google.protobuf import timestamp_pb2 + + bq_table = node.source.table.get_table_ref() + read_options: dict[str, Any] = { + "selected_fields": [item.source_id for item in node.scan_list.items] + } + if node.source.sql_predicate: + read_options["row_restriction"] = node.source.sql_predicate + read_options = bq_storage_types.ReadSession.TableReadOptions(**read_options) + + table_mod_options = {} + if node.source.at_time: + snapshot_time = timestamp_pb2.Timestamp() + snapshot_time.FromDatetime(node.source.at_time) + table_mod_options["snapshot_time"] = snapshot_time = snapshot_time + table_mods = bq_storage_types.ReadSession.TableModifiers(**table_mod_options) + + def iterator_supplier(): + requested_session = bq_storage_types.stream.ReadSession( + table=bq_table.to_bqstorage(), + data_format=bq_storage_types.DataFormat.ARROW, + read_options=read_options, + table_modifiers=table_mods, + ) + # Single stream to maintain ordering + request = bq_storage_types.CreateReadSessionRequest( + parent=f"projects/{self.project}", + read_session=requested_session, + max_stream_count=1, + ) + session = self.bqstoragereadclient.create_read_session( + request=request, retry=None + ) + + if not session.streams: + return iter([]) + + reader = self.bqstoragereadclient.read_rows( + session.streams[0].name, retry=None + ) + rowstream = reader.rows() + return map(lambda page: page.to_arrow(), rowstream.pages) + + return executor.ExecuteResult( + arrow_batches=iterator_supplier, + schema=plan.schema, + query_job=None, + total_bytes=None, + total_rows=node.source.n_rows, + ) diff --git a/bigframes/session/semi_executor.py b/bigframes/session/semi_executor.py new file mode 100644 index 0000000000..c41d7c96d3 --- /dev/null +++ b/bigframes/session/semi_executor.py @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import abc +from typing import Optional + +from bigframes.core import bigframe_node +from bigframes.session import executor + + +# Unstable interface, in development +class SemiExecutor(abc.ABC): + """ + A semi executor executes a subset of possible plans, returns None for unsupported plans. 
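+
+    Illustrative usage (not part of this change): a session may try each
+    registered SemiExecutor in turn and fall back to the full query executor
+    when execute() returns None.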
+ """ + + def execute( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + peek: Optional[int] = None, + ) -> Optional[executor.ExecuteResult]: + raise NotImplementedError("execute not implemented for this executor") diff --git a/bigframes/session/temporary_storage.py b/bigframes/session/temporary_storage.py new file mode 100644 index 0000000000..0c2a36f3fe --- /dev/null +++ b/bigframes/session/temporary_storage.py @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Protocol, Sequence + +from google.cloud import bigquery + + +class TemporaryStorageManager(Protocol): + @property + def location(self) -> str: + ... + + def create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] + ) -> bigquery.TableReference: + ... + + # implementations should be robust to repeatedly closing + def close(self) -> None: + ... diff --git a/bigframes/testing/__init__.py b/bigframes/testing/__init__.py new file mode 100644 index 0000000000..529c08241d --- /dev/null +++ b/bigframes/testing/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""[Experimental] Utilities for testing BigQuery DataFrames. + +These modules are provided for testing the BigQuery DataFrames package. The +interface is not considered stable. +""" diff --git a/tests/unit/resources.py b/bigframes/testing/mocks.py similarity index 90% rename from tests/unit/resources.py rename to bigframes/testing/mocks.py index ebc1243eaf..ab48b97f0d 100644 --- a/tests/unit/resources.py +++ b/bigframes/testing/mocks.py @@ -36,12 +36,17 @@ def create_bigquery_session( + *, bqclient: Optional[mock.Mock] = None, session_id: str = "abcxyz", table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, location: str = "test-region", ) -> bigframes.Session: + """[Experimental] Create a mock BigQuery DataFrames session that avoids making Google Cloud API calls. + + Intended for unit test environments that don't have access to the network. 
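+
+    Example (illustrative sketch, inside a pytest test that has the
+    monkeypatch fixture):
+
+        from bigframes.testing import mocks
+
+        session = mocks.create_bigquery_session()
+        df = mocks.create_dataframe(monkeypatch, session=session)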
+ """ credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) @@ -108,8 +113,12 @@ def query_and_wait_mock(query, *args, **kwargs): def create_dataframe( - monkeypatch: pytest.MonkeyPatch, session: Optional[bigframes.Session] = None + monkeypatch: pytest.MonkeyPatch, *, session: Optional[bigframes.Session] = None ) -> bigframes.dataframe.DataFrame: + """[Experimental] Create a mock DataFrame that avoids making Google Cloud API calls. + + Intended for unit test environments that don't have access to the network. + """ if session is None: session = create_bigquery_session() diff --git a/bigframes/version.py b/bigframes/version.py index 356e73a71d..e3a1d84bfa 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.42.0" +__version__ = "2.0.0" # {x-release-please-start-date} -__release_date__ = "2025-03-27" +__release_date__ = "2025-04-17" # {x-release-please-end} diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b00044b087..1e712848df 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -86,6 +86,9 @@ uid: bigframes.operations.structs.StructAccessor - name: PlotAccessor uid: bigframes.operations.plotting.PlotAccessor + - name: BlobAccessor + uid: bigframes.operations.blob.BlobAccessor + status: beta name: Series - name: Window uid: bigframes.core.window.Window diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index c190f219af..f830e35c16 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -248,8 +248,8 @@ }, "outputs": [], "source": [ - "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n", - "def data_generator(id):\n", + "@bpd.remote_function(packages=['faker', 'pandas'], cloud_function_service_account=\"default\")\n", + "def data_generator(id: int) -> str:\n", " context = {}\n", " exec(code, context)\n", " result_df = context.get(\"result_df\")\n", diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb new file mode 100644 index 0000000000..9f35d3864a --- /dev/null +++ b/notebooks/experimental/ai_operators.ipynb @@ -0,0 +1,3178 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "UYeZd_I8iouP" + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rWJnGj2ViouP" + }, + "source": [ + "# BigFrames AI Operator Tutorial\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
+        "Run in Colab | View on GitHub | Open in BQ Studio\n",
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mgOrr256iouQ" + }, + "source": [ + "This notebook provides a hands-on preview of AI operator APIs powered by the Gemini model.\n", + "\n", + "The notebook is divided into two sections. The first section introduces the API syntax with examples, aiming to familiarize you with how AI operators work. The second section applies AI operators to a large real-world dataset and presents performance statistics.\n", + "\n", + "This work is inspired by [this paper](https://arxiv.org/pdf/2407.11418) and powered by BigQuery ML and Vertex AI." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2ymVbJV2iouQ" + }, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vvVzFzo3iouQ" + }, + "source": [ + "First, import the BigFrames modules.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Jb9glT2ziouQ" + }, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xQiCWj7OiouQ" + }, + "source": [ + "Make sure the BigFrames version is at least `1.42.0`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "LTPpI8IpiouQ" + }, + "outputs": [], + "source": [ + "from packaging.version import Version\n", + "\n", + "assert Version(bigframes.__version__) >= Version(\"1.42.0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "agxLmtlbiouR" + }, + "source": [ + "Turn on the AI operator experiment. You will see a warning sign saying that these operators are still under experiments. If you don't turn on the experiment before using the operators, you will get `NotImplemenetedError`s." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "1wXqdDr8iouR" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", + "the future.\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.ai_operators = True" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W8TPUvnsqxhv" + }, + "source": [ + "Specify your GCP project and location." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vCkraKOeqJFl" + }, + "outputs": [], + "source": [ + "bpd.options.bigquery.project = 'YOUR_PROJECT_ID'\n", + "bpd.options.bigquery.location = 'US'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n95MFlS0iouR" + }, + "source": [ + "**Optional**: turn off the display of progress bar so that only the operation results will be printed out" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "5r6ahx7MiouR" + }, + "outputs": [], + "source": [ + "bpd.options.display.progress_bar = None" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "93iYvp7niouR" + }, + "source": [ + "Create LLM instances. They will be passed in as parameters for each AI operator.\n", + "\n", + "This tutorial uses the \"gemini-2.0-flash-001\" model for text generation and \"text-embedding-005\" for embedding. 
While these are recommended, you can choose [other Vertex AI LLM models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) based on your needs and availability. Ensure you have [sufficient quota](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas) for your chosen models and adjust it if necessary." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "tHkymaLNiouR" + }, + "outputs": [], + "source": [ + "from bigframes.ml import llm\n", + "gemini_model = llm.GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mbFDcvnPiouR" + }, + "source": [ + "**Note**: AI operators could be expensive over a large set of data. As a result, our team added this option `bigframes.options.compute.ai_ops_confirmation_threshold` at `version 1.42.0` so that the BigFrames will ask for your confirmation if the amount of data to be processed is too large. If the amount of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", + "\n", + "The default threshold is 0, which means the operators will always ask for confirmations. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F4dZm4b7iouR" + }, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", + " bigframes.options.compute.ai_ops_confirmation_threshold = 1000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_dEA3G9RiouR" + }, + "source": [ + "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.ai_ops_threshold_autofail` to `True`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BoUK-cpbiouS" + }, + "outputs": [], + "source": [ + "# if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", + "# bigframes.options.compute.ai_ops_threshold_autofail = True" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hQft3o3OiouS" + }, + "source": [ + "# API Samples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dt5Kl-QGiouS" + }, + "source": [ + "You will learn about each AI operator by trying some examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J7XAT459iouS" + }, + "source": [ + "## AI Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9d5HUIvliouS" + }, + "source": [ + "AI filtering allows you to filter your dataframe based on the instruction (i.e. prompt) you provided.\n", + "\n", + "First, create a dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "id": "NDpCRGd_iouS", + "outputId": "5048c935-06d3-4ef1-ad87-72e14a30b1b7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " country city\n", + "0 USA Seattle\n", + "1 Germany Berlin\n", + "2 Japan Kyoto\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'country': ['USA', 'Germany', 'Japan'], 'city': ['Seattle', 'Berlin', 'Kyoto']})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6AXmT7sniouS" + }, + "source": [ + "Now, filter this dataframe by keeping only the rows where the value in `city` column is the capital of the value in `country` column. The column references could be \"escaped\" by using a pair of braces in your instruction. In this example, your instruction should be like this:\n", + "```\n", + "The {city} is the capital of the {country}.\n", + "```\n", + "\n", + "Note that this is not a Python f-string, so you shouldn't prefix your instruction with an `f`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 127 + }, + "id": "ipW3Z_l4iouS", + "outputId": "ad447459-225a-419c-d4c8-fedac4a9ed0f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " country city\n", + "1 Germany Berlin\n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ai.filter(\"The {city} is the capital of the {country}\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "swKvgfm1iouS" + }, + "source": [ + "The filter operator extracts the information from the referenced column to enrich your instruction with context. The instruction is then sent for the designated model for evaluation. For filtering operations, the LLM is asked to return only `True` and `False` for each row, and the operator removes the rows accordingly." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r_2AAGGoiouS" + }, + "source": [ + "## AI Mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vT6skC57iouS" + }, + "source": [ + "AI mapping allows to you to combine values from multiple columns into a single output based your instruction.\n", + "\n", + "Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "id": "BQ7xeUK3iouS", + "outputId": "33dcb742-77ed-4bea-8dbc-1cf775102a25" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " ingredient_1 ingredient_2\n", + "0 Bun Beef Patty\n", + "1 Soy Bean Bittern\n", + "2 Sausage Long Bread\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\n", + " \"ingredient_1\": [\"Bun\", \"Soy Bean\", \"Sausage\"],\n", + " \"ingredient_2\": [\"Beef Patty\", \"Bittern\", \"Long Bread\"]\n", + " })\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VFObP2aFiouS" + }, + "source": [ + "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "id": "PpL24AQFiouS", + "outputId": "e7aff038-bf4b-4833-def8-fe2648e8885b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[3 rows x 3 columns in total]" + ], + "text/plain": [ + " ingredient_1 ingredient_2 food\n", + "0 Bun Beef Patty Burger\n", + "\n", + "1 Soy Bean Bittern Tofu\n", + "\n", + "2 Sausage Long Bread Hotdog\n", + "\n", + "\n", + "[3 rows x 3 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "70WTZZfdiouS" + }, + "source": [ + "## AI Joining" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u93uieRaiouS" + }, + "source": [ + "AI joining can join two dataframes based on the instruction you provided.\n", + "\n", + "First, you prepare two dataframes:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "dffIGEUEiouS" + }, + "outputs": [], + "source": [ + "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", + "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hz0X-0RtiouS" + }, + "source": [ + "You want to join the `cities` with `continents` to form a new dataframe such that, in each row the city from the `cities` data frame is in the continent from the `continents` dataframe. You could re-use the aforementioned column reference syntax:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "WPIOHEwCiouT", + "outputId": "976586c3-b5db-4088-a46a-44dfbf822ecb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[4 rows x 2 columns in total]" + ], + "text/plain": [ + " city continent\n", + "0 Seattle North America\n", + "1 Ottawa North America\n", + "2 Shanghai Asia\n", + "3 New Delhi Asia\n", + "\n", + "[4 rows x 2 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cities.ai.join(continents, \"{city} is in {continent}\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Qc97GMWiouT" + }, + "source": [ + "!! **Important:** AI join can trigger probihitively expensitve operations! This operation first cross joins two dataframes, then invokes AI filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MUEJXT1IiouT" + }, + "source": [ + "### Self Joins" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QvX-nCogiouT" + }, + "source": [ + "This self-join example is for demonstrating a special case: what happens when the joining columns exist in both data frames? It turns out that you need to provide extra information in your column references: by attaching \"left.\" and \"right.\" prefixes to your column names.\n", + "\n", + "Create an example data frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "OIGz5sqxiouW" + }, + "outputs": [], + "source": [ + "animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VmJbuWNniouX" + }, + "source": [ + "You want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, you use `left.animal` and `right.animal` to differentiate the data sources:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 284 + }, + "id": "UHfggdhBiouX", + "outputId": "a439e3aa-1382-4244-951f-127dc8da0fe3" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[6 rows x 2 columns in total]" + ], + "text/plain": [ + " animal_left animal_right\n", + "0 cow cat\n", + "1 cow spider\n", + "2 cat spider\n", + "3 elephant cow\n", + "4 elephant cat\n", + "5 elephant spider\n", + "\n", + "[6 rows x 2 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "animals.ai.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kU7BsyTyiouX" + }, + "source": [ + "## AI Top K" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s9QePXEoiouX" + }, + "source": [ + "AI Top K selects the top K values based on your instruction. Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "bMQqtyZ2iouX" + }, + "outputs": [], + "source": [ + "df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KiljGBSCiouX" + }, + "source": [ + "You want to find the top two most popular pets:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 + }, + "id": "OZv5WUGIiouX", + "outputId": "ae1cee27-cc31-455e-c4ac-c0a9a5cf4ca5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[2 rows x 1 columns in total]" + ], + "text/plain": [ + " Animals\n", + "0 Corgi\n", + "1 Orange Cat\n", + "\n", + "[2 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ai.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dC8fyu3aiouX" + }, + "source": [ + "Under the hood, the AI top K operator performs pair-wise comparisons with LLM. The top K results are returned in the order of their indices instead of their ranks." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sIszJ0zPiouX" + }, + "source": [ + "## AI Search" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e4ojHRKAiouX" + }, + "source": [ + "AI search searches the most similar values to your query within a single column. Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "id": "gnQSIZ5SiouX", + "outputId": "dd6e1ecb-1bad-4a7c-8065-e56c697d0863" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " creatures\n", + "0 salmon\n", + "1 sea urchin\n", + "2 baboons\n", + "3 frog\n", + "4 chimpanzee\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5apfIaZMiouX" + }, + "source": [ + "You want to get the top 2 creatures that are most similar to \"monkey\":" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 + }, + "id": "CkAuFgPYiouY", + "outputId": "723c7604-f53c-43d7-c754-4c91ec198dff" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " creatures similarity score\n", + "2 baboons 0.708434\n", + "4 chimpanzee 0.635844\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ai.search(\"creatures\", query=\"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GDZeVzFTiouY" + }, + "source": [ + "Note that you are using a text embedding model this time. This model generates embedding vectors for both your query as well as the values in the search space. The operator then uses BigQuery's built-in VECTOR_SEARCH function to find the nearest neighbors of your query.\n", + "\n", + "In addition, `score_column` is an optional parameter for storing the distances between the results and your query. If not set, the score column won't be attached to the result." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EXNutIXqiouY" + }, + "source": [ + "## AI Similarity Join" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BhWrhQMjiouY" + }, + "source": [ + "When you want to perform multiple similarity queries in the same value space, you could use similarity join to simplify your call. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "cUc7-8O6iouY" + }, + "outputs": [], + "source": [ + "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", + "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k96WerOviouY" + }, + "source": [ + "In this example, you want to pick the most related animal from `df2` for each value in `df1`." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "id": "wPV5EkfpiouY", + "outputId": "4be1211d-0353-4b94-8c27-ebd568e8e104" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[5 rows x 3 columns in total]" + ], + "text/plain": [ + " animal animal_1 distance\n", + "0 monkey baboon 0.620521\n", + "1 spider scorpion 0.728024\n", + "2 salmon tuna 0.782141\n", + "3 giraffe elephant 0.7135\n", + "4 sparrow owl 0.810864\n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.ai.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model=text_embedding_model, score_column='distance')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GplzD7v0iouY" + }, + "source": [ + "!! **Important** Like AI join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `bigframes.options.compute.sem_ops_confirmation_threshold` option to specify a threshold." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hgj8GoQhiouY" + }, + "source": [ + "# Performance Analyses" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EZomL0BciouY" + }, + "source": [ + "In this section, you will use BigQuery's public data of hacker news to perform some heavy work. We recommend you to check the code without executing them in order to save your time and money. The execution results are attached after each cell for your reference.\n", + "\n", + "First, load 3k rows from the table:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 880 + }, + "id": "wRR0SrcSiouY", + "outputId": "3b25f3a3-09c7-4396-9107-4aa4cdb4b963" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[3000 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 Workplace Wellness Programs Don’t Work Well. W... \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 Taiwan’s Tech King to Nancy Pelosi: U.S. Is in... \n", + "11 Android’s new multitasking is terrible and sho... \n", + "12 \n", + "13 \n", + "14 \n", + "15 \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 \n", + "22 \n", + "23 \n", + "24 \n", + "\n", + " text by score \\\n", + "0 \n", + "1 I'd agree about border control with a cav... bandrami \n", + "2 So 4 pickups? At least pickups are high margin... seanmcdirmid \n", + "3 anarbadalov 2 \n", + "4 Are you implying that to be a good developer y... ecesena \n", + "5 It pretty much works with other carriers. My s... toast0 \n", + "6 \n", + "7 "not operated for profit" and "... radford-neal \n", + "8 It's a good description of one applicatio... dkarl \n", + "9 Might be a bit high, but....

"For ex... tyingq \n", + "10 dlcmh 11 \n", + "11 wowamit 1 \n", + "12 SEEKING WORK | REMOTE | US Citizen

Location:... rasikjain \n", + "13 I had a very similar experience last month tea... tmaly \n", + "14 mrtweetyhack \n", + "15 > Just do what most American cities do with... AnthonyMouse \n", + "16 It's not a space. The l and the C are at ... antninja \n", + "17 I’ve knowingly paid the premium in the past, j... zwily \n", + "18 > Any sufficiently complicated C or Fortran... wavemode \n", + "19 It's similar to a lot of Japanese "t... TillE \n", + "20 Engineers are just people paid to code. If you... rchaud \n", + "21 So don't use it CyberDildonics \n", + "22 Sure, but there are degrees of these things. T... dang \n", + "23 I wish this would happen. There's a &quo... coredog64 \n", + "24 I’m not sure why responsible riders wouldn’t w... mjmahone17 \n", + "\n", + " timestamp type \n", + "0 2010-04-16 19:52:51+00:00 comment \n", + "1 2023-06-04 06:12:00+00:00 comment \n", + "2 2023-09-19 14:19:46+00:00 comment \n", + "3 2018-08-07 12:17:45+00:00 story \n", + "4 2016-06-10 19:38:25+00:00 comment \n", + "5 2024-08-13 03:11:32+00:00 comment \n", + "6 2020-06-07 22:43:03+00:00 comment \n", + "7 2020-03-19 00:24:47+00:00 comment \n", + "8 2024-10-07 13:38:18+00:00 comment \n", + "9 2017-01-23 19:49:15+00:00 comment \n", + "10 2023-02-18 02:51:11+00:00 story \n", + "11 2018-10-22 09:50:36+00:00 story \n", + "12 2024-08-01 16:56:49+00:00 comment \n", + "13 2020-01-22 18:26:36+00:00 comment \n", + "14 2022-02-26 19:34:00+00:00 comment \n", + "15 2021-10-04 23:10:50+00:00 comment \n", + "16 2013-07-13 09:48:34+00:00 comment \n", + "17 2020-06-17 14:26:43+00:00 comment \n", + "18 2025-02-07 06:42:53+00:00 comment \n", + "19 2022-11-06 17:15:10+00:00 comment \n", + "20 2023-04-12 14:31:42+00:00 comment \n", + "21 2015-12-29 22:01:16+00:00 comment \n", + "22 2021-11-11 23:42:12+00:00 comment \n", + "23 2018-02-12 16:03:37+00:00 comment \n", + "24 2021-11-09 01:36:01+00:00 comment \n", + "...\n", + "\n", + "[3000 rows x 6 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", + "hacker_news" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3e94DPOdiouY" + }, + "source": [ + "Then, keep only the rows that have text content:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mQl8hc1biouY", + "outputId": "2b4ffa85-9d95-4a20-9040-0420c67da2d4" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2533" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news_with_texts = hacker_news[hacker_news['text'].isnull() == False]\n", + "len(hacker_news_with_texts)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWalDtLDiouZ" + }, + "source": [ + "You can get an idea of the input token length by calculating the average string length." 
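+    " If you assume roughly 4 characters per token (a heuristic assumption, not an official conversion rate), dividing the average length by 4 gives a ballpark per-row token count:\n",
+    "\n",
+    "```python\n",
+    "avg_chars = hacker_news_with_texts['text'].str.len().mean()\n",
+    "avg_chars / 4  # ballpark tokens per row under the ~4 chars/token assumption\n",
+    "```"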
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PZeg4LCUiouZ", + "outputId": "05b67cac-6b3d-42ef-d6d6-b578a9734f4c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "393.2356889064355" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news_with_texts['text'].str.len().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2IXqskHHiouZ" + }, + "source": [ + "**Optional**: You can raise the confirmation threshold so that larger AI operations proceed without pausing to ask for confirmation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EpjXQ4FViouZ" + }, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", + " bigframes.options.compute.ai_ops_confirmation_threshold = 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SYFB-X1RiouZ" + }, + "source": [ + "Now it's the LLM's turn: keep only the rows whose text is mainly about the iPhone. This operation takes several minutes to finish." + ] + },
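+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before launching the full run, you may want to dry-run the operator on a small sample to sanity-check the prompt. A minimal sketch, reusing the exact `ai.filter` call from the next cell:\n", + "\n", + "```python\n", + "# Try the semantic filter on 20 rows first; quick and cheap compared to ~2.5k rows.\n", + "sample = hacker_news_with_texts.head(20)\n", + "sample.ai.filter(\"The {text} is mainly focused on iPhone\", gemini_model)\n", + "```" + ] + },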
+ { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "id": "rditQlmoiouZ", + "outputId": "2b44dcbf-2ef5-4119-ca05-9b082db9c0c1" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "[rendered HTML table elided; the same 11 rows appear in the text/plain output below] 
[11 rows x 6 columns in total]" + ], + "text/plain": [ + " title text \\\n", + "445 If I want to manipulate a device, I'll bu... \n", + "967 I've had my 6S Plus now for 36 months and... \n", + "1253 Apple is far more closed and tyrannical with i... \n", + "1274 An iOS version was released earlier this year.... \n", + "1548 I’m not sure how that fits with Apple pursuing... \n", + "1630 Not sure if you’re being ironic, but I use an ... \n", + "1664 Quoting from the article I linked you:

>&... \n", + "1884 > Not all wireless headsets are the same, h... \n", + "2251 Will not buy any more apple product, iphone 4s... \n", + "2877 I've been an iPhone user since the OG in ... \n", + "\n", + " by score timestamp type \n", + "445 exelius 2017-09-21 17:39:37+00:00 comment \n", + "967 blinding-streak 2023-04-30 19:10:16+00:00 comment \n", + "975 throwaway427 2019-01-03 18:06:33+00:00 comment \n", + "1253 RyanMcGreal 2012-12-21 00:45:40+00:00 comment \n", + "1274 pls2halp 2017-12-09 06:36:41+00:00 comment \n", + "1548 alphabettsy 2021-12-26 19:41:38+00:00 comment \n", + "1630 lxgr 2025-03-29 03:57:25+00:00 comment \n", + "1664 StreamBright 2017-09-11 19:57:34+00:00 comment \n", + "1884 cptskippy 2021-11-16 13:28:44+00:00 comment \n", + "2251 omi 2012-09-11 14:42:52+00:00 comment \n", + "2877 vsnf 2024-04-15 06:28:09+00:00 comment \n", + "\n", + "[11 rows x 6 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iphone_comments = hacker_news_with_texts.ai.filter(\"The {text} is mainly focused on iPhone\", gemini_model)\n", + "iphone_comments" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yl24sJFIiouZ" + }, + "source": [ + "The performance of the ai operators depends on the length of your input as well as your quota. Here are our benchmarks for running the previous operation with Gemini Flash 1.5 over data of different sizes. Here are the estimates supposing your quota is [the default 200 requests per minute](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas):\n", + "\n", + "* 800 Rows -> ~4m\n", + "* 2550 Rows -> ~13m\n", + "* 8500 Rows -> ~40m\n", + "\n", + "These numbers can give you a general idea of how fast the operators run." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eo4nfISuiouZ" + }, + "source": [ + "Now, use LLM to summarize the sentiments towards iPhone:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "id": "IlKBrNxUiouZ", + "outputId": "818d01e4-1cdf-42a2-9e02-61c4736a8905" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[rendered HTML table elided; the same 11 rows, including the sentiment column, appear in the text/plain output below] 
[11 rows x 7 columns in total]" + ], + "text/plain": [ + " title text \\\n", + "445 If I want to manipulate a device, I'll bu... \n", + "967
I've had my 6S Plus now for 36 months and... \n", + "1253 Apple is far more closed and tyrannical with i... \n", + "1274 An iOS version was released earlier this year.... \n", + "1548 I’m not sure how that fits with Apple pursuing... \n", + "1630 Not sure if you’re being ironic, but I use an ... \n", + "1664 Quoting from the article I linked you:

>&... \n", + "1884 > Not all wireless headsets are the same, h... \n", + "2251 Will not buy any more apple product, iphone 4s... \n", + "2877 I've been an iPhone user since the OG in ... \n", + "\n", + " by score timestamp type \\\n", + "445 exelius 2017-09-21 17:39:37+00:00 comment \n", + "967 blinding-streak 2023-04-30 19:10:16+00:00 comment \n", + "975 throwaway427 2019-01-03 18:06:33+00:00 comment \n", + "1253 RyanMcGreal 2012-12-21 00:45:40+00:00 comment \n", + "1274 pls2halp 2017-12-09 06:36:41+00:00 comment \n", + "1548 alphabettsy 2021-12-26 19:41:38+00:00 comment \n", + "1630 lxgr 2025-03-29 03:57:25+00:00 comment \n", + "1664 StreamBright 2017-09-11 19:57:34+00:00 comment \n", + "1884 cptskippy 2021-11-16 13:28:44+00:00 comment \n", + "2251 omi 2012-09-11 14:42:52+00:00 comment \n", + "2877 vsnf 2024-04-15 06:28:09+00:00 comment \n", + "\n", + " sentiment \n", + "445 Pragmatic, slightly annoyed\n", + " \n", + "967 I lack the ability to access external websites... \n", + "975 Generally positive, impressed.\n", + " \n", + "1253 Negative towards Apple\n", + " \n", + "1274 Neutral, factual statement.\n", + " \n", + "1548 Skeptical and critical.\n", + " \n", + "1630 Wants interoperability, frustrated.\n", + " \n", + "1664 Extremely positive review\n", + " \n", + "1884 Skeptical and critical\n", + " \n", + "2251 Negative, regretful.\n", + " \n", + "2877 Mildly annoyed, resigned\n", + " \n", + "\n", + "[11 rows x 7 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iphone_comments.ai.map(\"Summarize the sentiment of the {text}. Your answer should have at most 3 words\", output_column=\"sentiment\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y7_16T2xiouZ" + }, + "source": [ + "Here is another example: count the number of rows whose authors have animals in their names." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 880 + }, + "id": "CbGwc_uXiouZ", + "outputId": "138acca0-7fb9-495a-e797-0d42495d65e6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2025-04-02 18:00:55.801294+00:00 to avoid\n", + "incompatibilies with previous reads of this table. To read the latest\n", + "version, set `use_cache=False` or close the current session with\n", + "Session.close() or bigframes.pandas.close_session().\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + }, + { + "data": { + "text/html": [ + "
[rendered HTML preview (25 of 3000 rows) elided; the same rows appear in the text/plain output below] 
[3000 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 Workplace Wellness Programs Don’t Work Well. W... \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 Taiwan’s Tech King to Nancy Pelosi: U.S. Is in... \n", + "11 Android’s new multitasking is terrible and sho... \n", + "12 \n", + "13 \n", + "14 \n", + "15 \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 \n", + "22 \n", + "23 \n", + "24 \n", + "\n", + " text by score \\\n", + "0 \n", + "1 I'd agree about border control with a cav... bandrami \n", + "2 So 4 pickups? At least pickups are high margin... seanmcdirmid \n", + "3 anarbadalov 2 \n", + "4 Are you implying that to be a good developer y... ecesena \n", + "5 It pretty much works with other carriers. My s... toast0 \n", + "6 \n", + "7 "not operated for profit" and "... radford-neal \n", + "8 It's a good description of one applicatio... dkarl \n", + "9 Might be a bit high, but....

"For ex... tyingq \n", + "10 dlcmh 11 \n", + "11 wowamit 1 \n", + "12 SEEKING WORK | REMOTE | US Citizen

Location:... rasikjain \n", + "13 I had a very similar experience last month tea... tmaly \n", + "14 mrtweetyhack \n", + "15 > Just do what most American cities do with... AnthonyMouse \n", + "16 It's not a space. The l and the C are at ... antninja \n", + "17 I’ve knowingly paid the premium in the past, j... zwily \n", + "18 > Any sufficiently complicated C or Fortran... wavemode \n", + "19 It's similar to a lot of Japanese "t... TillE \n", + "20 Engineers are just people paid to code. If you... rchaud \n", + "21 So don't use it CyberDildonics \n", + "22 Sure, but there are degrees of these things. T... dang \n", + "23 I wish this would happen. There's a &quo... coredog64 \n", + "24 I’m not sure why responsible riders wouldn’t w... mjmahone17 \n", + "\n", + " timestamp type \n", + "0 2010-04-16 19:52:51+00:00 comment \n", + "1 2023-06-04 06:12:00+00:00 comment \n", + "2 2023-09-19 14:19:46+00:00 comment \n", + "3 2018-08-07 12:17:45+00:00 story \n", + "4 2016-06-10 19:38:25+00:00 comment \n", + "5 2024-08-13 03:11:32+00:00 comment \n", + "6 2020-06-07 22:43:03+00:00 comment \n", + "7 2020-03-19 00:24:47+00:00 comment \n", + "8 2024-10-07 13:38:18+00:00 comment \n", + "9 2017-01-23 19:49:15+00:00 comment \n", + "10 2023-02-18 02:51:11+00:00 story \n", + "11 2018-10-22 09:50:36+00:00 story \n", + "12 2024-08-01 16:56:49+00:00 comment \n", + "13 2020-01-22 18:26:36+00:00 comment \n", + "14 2022-02-26 19:34:00+00:00 comment \n", + "15 2021-10-04 23:10:50+00:00 comment \n", + "16 2013-07-13 09:48:34+00:00 comment \n", + "17 2020-06-17 14:26:43+00:00 comment \n", + "18 2025-02-07 06:42:53+00:00 comment \n", + "19 2022-11-06 17:15:10+00:00 comment \n", + "20 2023-04-12 14:31:42+00:00 comment \n", + "21 2015-12-29 22:01:16+00:00 comment \n", + "22 2021-11-11 23:42:12+00:00 comment \n", + "23 2018-02-12 16:03:37+00:00 comment \n", + "24 2021-11-09 01:36:01+00:00 comment \n", + "...\n", + "\n", + "[3000 rows x 6 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", + "hacker_news" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 880 + }, + "id": "9dzU8SNziouZ", + "outputId": "da8815c1-c411-4afc-d1ca-5e44c75b5b48" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
[rendered HTML preview (25 of 112 rows) elided; the same rows appear in the text/plain output below] 
[112 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "15 \n", + "16 \n", + "23 \n", + "27 \n", + "36 \n", + "150 \n", + "160 \n", + "205 \n", + "231 \n", + "250 \n", + "320 Protest against Bill C-11, Canada's SOPA, plan... \n", + "344 \n", + "348 The flu vaccine this year is only 10% effective \n", + "360 \n", + "398 1 + 1 = 3 \n", + "407 \n", + "454 \n", + "457 Brazilian Rails Websites \n", + "472 \n", + "493 \n", + "497 \n", + "514 \n", + "535 \n", + "607 \n", + "672 \n", + "\n", + " text by \\\n", + "15 > Just do what most American cities do with... AnthonyMouse \n", + "16 It's not a space. The l and the C are at ... antninja \n", + "23 I wish this would happen. There's a &quo... coredog64 \n", + "27 Flash got close, but was too complex and expen... surfingdino \n", + "36 I think the "algo genius" type of de... poisonborz \n", + "150 No one will be doing anything practical with a... NeutralCrane \n", + "160 I think this is more semantics than anything.<... superb-owl \n", + "205 Interesting to think of sign language localisa... robin_reala \n", + "231 Probably because of their key location. ape4 \n", + "250 I realize this is a bit passe, but there were ... FeepingCreature \n", + "320 magikarp \n", + "344 What? Are you suggesting we cannot criticize p... chickenpotpie \n", + "348 maryfoxmarlow \n", + "360 Bomb ownership is okay AFAIK. Intent to commi... Ferret7446 \n", + "398 oscar-the-horse \n", + "407 No (almost certainly), but you will become fru... AnimalMuppet \n", + "454 48h is less than 5 kWh of batteries, one quart... tigershark \n", + "457 akitaonrails \n", + "472 > When most people start as programmers, th... PavlovsCat \n", + "493 Related anecdata + a study I found useful. Aft... TrainedMonkey \n", + "497 That "civilized" country has too man... rantanplan \n", + "514 The current Go 2 drafts do. tapirl \n", + "535 Having walked this same path, this blog resona... curiousllama \n", + "607 If people thought the reward for talking to a ... slapfrog \n", + "672 Given that you say you're 38 and looking ... 
strix_varius \n", + "\n", + " score timestamp type \n", + "15 2021-10-04 23:10:50+00:00 comment \n", + "16 2013-07-13 09:48:34+00:00 comment \n", + "23 2018-02-12 16:03:37+00:00 comment \n", + "27 2024-05-08 05:02:37+00:00 comment \n", + "36 2024-06-04 07:39:08+00:00 comment \n", + "150 2025-02-01 14:26:25+00:00 comment \n", + "160 2022-06-08 16:55:54+00:00 comment \n", + "205 2019-02-01 11:49:23+00:00 comment \n", + "231 2014-08-29 14:55:40+00:00 comment \n", + "250 2023-10-15 11:32:44+00:00 comment \n", + "320 1 2012-01-29 02:14:12+00:00 story \n", + "344 2020-12-02 18:24:19+00:00 comment \n", + "348 3 2018-02-02 02:19:42+00:00 story \n", + "360 2023-06-25 20:04:30+00:00 comment \n", + "398 2 2012-08-05 22:18:51+00:00 story \n", + "407 2023-09-15 16:11:08+00:00 comment \n", + "454 2021-07-23 05:12:52+00:00 comment \n", + "457 1 2008-07-27 17:27:47+00:00 story \n", + "472 2018-12-23 20:37:20+00:00 comment \n", + "493 2023-02-02 16:14:23+00:00 comment \n", + "497 2017-02-17 12:51:51+00:00 comment \n", + "514 2020-08-12 02:37:41+00:00 comment \n", + "535 2020-10-07 20:35:18+00:00 comment \n", + "607 2021-09-08 20:58:13+00:00 comment \n", + "672 2023-08-04 02:41:50+00:00 comment \n", + "...\n", + "\n", + "[112 rows x 6 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news.ai.filter(\"{by} contains animal name\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3bpkaspoiouZ" + }, + "source": [ + "Here are the runtime numbers with 500 requests per minute [raised quota](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas):\n", + "* 3000 rows -> ~6m\n", + "* 10000 rows -> ~26m" + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/experimental/longer_ml_demo.ipynb b/notebooks/experimental/longer_ml_demo.ipynb deleted file mode 100644 index 793ff58ecd..0000000000 --- a/notebooks/experimental/longer_ml_demo.ipynb +++ /dev/null @@ -1,1925 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "71fbfc47", - "metadata": {}, - "source": [ - "**Note: this notebook requires changes not yet checked in**\n", - "\n", - "# Introduction\n", - "\n", - "This is a prototype for how a minimal SKLearn-like wrapper for BQML might work in BigQuery DataFrames.\n", - "\n", - "Disclaimer - this is not a polished design or a robust implementation, this is a quick prototype to workshop some ideas. 
Design will be next.\n", - "\n", - "What is BigQuery DataFrame?\n", - "- Pandas API for BigQuery\n", - "- Lets data scientists quickly iterate and prepare their data as they do in Pandas, but executed by BigQuery\n", - "\n", - "What is meant by SKLearn-like?\n", - "- Follow the API design practices from the SKLearn project\n", - " - [API design for machine learning software: experiences from the scikit-learn project](https://arxiv.org/pdf/1309.0238.pdf)\n", - "- Not a copy of, or compatible with, SKLearn\n", - "\n", - "Briefly, patterns taken from SKLearn are:\n", - "- Models and transforms are 'Estimators'\n", - " - A bundle of parameters with a consistent way to initialize/get/set\n", - " - And a .fit(..) method to fit to training data\n", - "- Models additionally have a .predict(..)\n", - "- By default, these objects are transient, making them easy to play around with. No need to give them names or decide how to persist them.\n", - "\n", - "\n", - "Design goals:\n", - "- Zero friction ML capabilities for BigQuery DataFrames users (no extra auth, configuration, etc)\n", - "- Offers first class integration with the Pandas-like BigQuery DataFrames API\n", - "- Uses SKLearn-like design patterns that feel familiar to data scientists\n", - "- Also a first class BigQuery experience\n", - " - Offers BigQuery's scalability and storage / compute management\n", - " - Works naturally with BigQuery's other interfaces, e.g. GUI and SQL\n", - " - BQML features" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "345c2163", - "metadata": {}, - "source": [ - "# Linear regression tutorial\n", - "\n", - "Adapted from the \"Penguin weight\" Linear Regression tutorial for BQML: https://cloud.google.com/bigquery-ml/docs/linear-regression-tutorial\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "03c9e168", - "metadata": {}, - "source": [ - "## Setting the scene\n", - "\n", - "Our conservationists have sent us some measurements of penguins found in the Antarctic islands. They say that some of the body mass measurements for the Adelie penguins are missing, and ask if we can use some data science magic to estimate them. Sounds like a job for a linear regression!\n", - "\n", - "Lets take a look at the data..." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d7a03de2-c0ef-4f80-9cd5-f96e87cf2d54", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tag_numberspeciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
01225Gentoo penguin (Pygoscelis papua)Biscoe<NA><NA><NA><NA><NA>
11278Gentoo penguin (Pygoscelis papua)Biscoe42.013.5210.04150.0FEMALE
21275Gentoo penguin (Pygoscelis papua)Biscoe46.513.5210.04550.0FEMALE
31233Gentoo penguin (Pygoscelis papua)Biscoe43.314.0208.04575.0FEMALE
41311Gentoo penguin (Pygoscelis papua)Biscoe47.514.0212.04875.0FEMALE
51316Gentoo penguin (Pygoscelis papua)Biscoe49.114.5212.04625.0FEMALE
61313Gentoo penguin (Pygoscelis papua)Biscoe45.514.5212.04750.0FEMALE
71381Gentoo penguin (Pygoscelis papua)Biscoe47.614.5215.05400.0MALE
81377Gentoo penguin (Pygoscelis papua)Biscoe45.114.5207.05050.0FEMALE
91380Gentoo penguin (Pygoscelis papua)Biscoe45.114.5215.05000.0FEMALE
101257Gentoo penguin (Pygoscelis papua)Biscoe46.214.5209.04800.0FEMALE
111336Gentoo penguin (Pygoscelis papua)Biscoe46.514.5213.04400.0FEMALE
121237Gentoo penguin (Pygoscelis papua)Biscoe43.214.5208.04450.0FEMALE
131302Gentoo penguin (Pygoscelis papua)Biscoe48.515.0219.04850.0FEMALE
141325Gentoo penguin (Pygoscelis papua)Biscoe49.115.0228.05500.0MALE
151285Gentoo penguin (Pygoscelis papua)Biscoe47.515.0218.04950.0FEMALE
161242Gentoo penguin (Pygoscelis papua)Biscoe49.615.0216.04750.0MALE
171246Gentoo penguin (Pygoscelis papua)Biscoe47.715.0216.04750.0FEMALE
181320Gentoo penguin (Pygoscelis papua)Biscoe45.515.0220.05000.0MALE
191244Gentoo penguin (Pygoscelis papua)Biscoe46.415.0216.04700.0FEMALE
\n", - "
[347 rows x 8 columns in total]" - ], - "text/plain": [ - " tag_number species island culmen_length_mm \\\n", - "0 1225 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "1 1278 Gentoo penguin (Pygoscelis papua) Biscoe 42.0 \n", - "2 1275 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "3 1233 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", - "4 1311 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "5 1316 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "6 1313 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "7 1381 Gentoo penguin (Pygoscelis papua) Biscoe 47.6 \n", - "8 1377 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "9 1380 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "10 1257 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 \n", - "11 1336 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "12 1237 Gentoo penguin (Pygoscelis papua) Biscoe 43.2 \n", - "13 1302 Gentoo penguin (Pygoscelis papua) Biscoe 48.5 \n", - "14 1325 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "15 1285 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "16 1242 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", - "17 1246 Gentoo penguin (Pygoscelis papua) Biscoe 47.7 \n", - "18 1320 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "19 1244 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 \n", - "20 1390 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", - "21 1379 Gentoo penguin (Pygoscelis papua) Biscoe 47.8 \n", - "22 1267 Gentoo penguin (Pygoscelis papua) Biscoe 50.1 \n", - "23 1389 Gentoo penguin (Pygoscelis papua) Biscoe 47.2 \n", - "24 1269 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 \n", - "1 13.5 210.0 4150.0 FEMALE \n", - "2 13.5 210.0 4550.0 FEMALE \n", - "3 14.0 208.0 4575.0 FEMALE \n", - "4 14.0 212.0 4875.0 FEMALE \n", - "5 14.5 212.0 4625.0 FEMALE \n", - "6 14.5 212.0 4750.0 FEMALE \n", - "7 14.5 215.0 5400.0 MALE \n", - "8 14.5 207.0 5050.0 FEMALE \n", - "9 14.5 215.0 5000.0 FEMALE \n", - "10 14.5 209.0 4800.0 FEMALE \n", - "11 14.5 213.0 4400.0 FEMALE \n", - "12 14.5 208.0 4450.0 FEMALE \n", - "13 15.0 219.0 4850.0 FEMALE \n", - "14 15.0 228.0 5500.0 MALE \n", - "15 15.0 218.0 4950.0 FEMALE \n", - "16 15.0 216.0 4750.0 MALE \n", - "17 15.0 216.0 4750.0 FEMALE \n", - "18 15.0 220.0 5000.0 MALE \n", - "19 15.0 216.0 4700.0 FEMALE \n", - "20 15.0 223.0 5550.0 MALE \n", - "21 15.0 215.0 5650.0 MALE \n", - "22 15.0 225.0 5000.0 MALE \n", - "23 15.5 215.0 4975.0 FEMALE \n", - "24 16.0 225.0 5700.0 MALE \n", - "...\n", - "\n", - "[347 rows x 8 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes.pandas\n", - "\n", - "df = bigframes.pandas.read_gbq(\"bigframes-dev.bqml_tutorial.penguins\")\n", - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "359524c4", - "metadata": {}, - "source": [ - "First we note that while we have a default numbered index generated by BigQuery, actually the penguins are uniquely identified by their tags.\n", - "\n", - "Lets make the data a bit friendlier to work with by setting the tag number column as the index." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "93d01411", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
tag_number
1225Gentoo penguin (Pygoscelis papua)Biscoe<NA><NA><NA><NA><NA>
1278Gentoo penguin (Pygoscelis papua)Biscoe42.013.5210.04150.0FEMALE
1275Gentoo penguin (Pygoscelis papua)Biscoe46.513.5210.04550.0FEMALE
1233Gentoo penguin (Pygoscelis papua)Biscoe43.314.0208.04575.0FEMALE
1311Gentoo penguin (Pygoscelis papua)Biscoe47.514.0212.04875.0FEMALE
1316Gentoo penguin (Pygoscelis papua)Biscoe49.114.5212.04625.0FEMALE
1313Gentoo penguin (Pygoscelis papua)Biscoe45.514.5212.04750.0FEMALE
1381Gentoo penguin (Pygoscelis papua)Biscoe47.614.5215.05400.0MALE
1377Gentoo penguin (Pygoscelis papua)Biscoe45.114.5207.05050.0FEMALE
1380Gentoo penguin (Pygoscelis papua)Biscoe45.114.5215.05000.0FEMALE
1257Gentoo penguin (Pygoscelis papua)Biscoe46.214.5209.04800.0FEMALE
1336Gentoo penguin (Pygoscelis papua)Biscoe46.514.5213.04400.0FEMALE
1237Gentoo penguin (Pygoscelis papua)Biscoe43.214.5208.04450.0FEMALE
1302Gentoo penguin (Pygoscelis papua)Biscoe48.515.0219.04850.0FEMALE
1325Gentoo penguin (Pygoscelis papua)Biscoe49.115.0228.05500.0MALE
1285Gentoo penguin (Pygoscelis papua)Biscoe47.515.0218.04950.0FEMALE
1242Gentoo penguin (Pygoscelis papua)Biscoe49.615.0216.04750.0MALE
1246Gentoo penguin (Pygoscelis papua)Biscoe47.715.0216.04750.0FEMALE
1320Gentoo penguin (Pygoscelis papua)Biscoe45.515.0220.05000.0MALE
1244Gentoo penguin (Pygoscelis papua)Biscoe46.415.0216.04700.0FEMALE
\n", - "
[347 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "tag_number \n", - "1225 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "1278 Gentoo penguin (Pygoscelis papua) Biscoe 42.0 \n", - "1275 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "1233 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", - "1311 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "1316 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "1313 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "1381 Gentoo penguin (Pygoscelis papua) Biscoe 47.6 \n", - "1377 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "1380 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "1257 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 \n", - "1336 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "1237 Gentoo penguin (Pygoscelis papua) Biscoe 43.2 \n", - "1302 Gentoo penguin (Pygoscelis papua) Biscoe 48.5 \n", - "1325 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "1285 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "1242 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", - "1246 Gentoo penguin (Pygoscelis papua) Biscoe 47.7 \n", - "1320 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "1244 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 \n", - "1390 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", - "1379 Gentoo penguin (Pygoscelis papua) Biscoe 47.8 \n", - "1267 Gentoo penguin (Pygoscelis papua) Biscoe 50.1 \n", - "1389 Gentoo penguin (Pygoscelis papua) Biscoe 47.2 \n", - "1269 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "tag_number \n", - "1225 \n", - "1278 13.5 210.0 4150.0 FEMALE \n", - "1275 13.5 210.0 4550.0 FEMALE \n", - "1233 14.0 208.0 4575.0 FEMALE \n", - "1311 14.0 212.0 4875.0 FEMALE \n", - "1316 14.5 212.0 4625.0 FEMALE \n", - "1313 14.5 212.0 4750.0 FEMALE \n", - "1381 14.5 215.0 5400.0 MALE \n", - "1377 14.5 207.0 5050.0 FEMALE \n", - "1380 14.5 215.0 5000.0 FEMALE \n", - "1257 14.5 209.0 4800.0 FEMALE \n", - "1336 14.5 213.0 4400.0 FEMALE \n", - "1237 14.5 208.0 4450.0 FEMALE \n", - "1302 15.0 219.0 4850.0 FEMALE \n", - "1325 15.0 228.0 5500.0 MALE \n", - "1285 15.0 218.0 4950.0 FEMALE \n", - "1242 15.0 216.0 4750.0 MALE \n", - "1246 15.0 216.0 4750.0 FEMALE \n", - "1320 15.0 220.0 5000.0 MALE \n", - "1244 15.0 216.0 4700.0 FEMALE \n", - "1390 15.0 223.0 5550.0 MALE \n", - "1379 15.0 215.0 5650.0 MALE \n", - "1267 15.0 225.0 5000.0 MALE \n", - "1389 15.5 215.0 4975.0 FEMALE \n", - "1269 16.0 225.0 5700.0 MALE \n", - "...\n", - "\n", - "[347 rows x 7 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.set_index(\"tag_number\")\n", - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f95fda6a", - "metadata": {}, - "source": [ - "We saw in the first view that there were some missing values. We're especially interested in observations that are missing just the body_mass_g, so lets look at those:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "941cb6c3-8c54-42ce-a945-4fa604176b2e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
tag_number
1225Gentoo penguin (Pygoscelis papua)Biscoe<NA><NA><NA><NA><NA>
1393Adelie Penguin (Pygoscelis adeliae)Torgersen<NA><NA><NA><NA><NA>
1524Adelie Penguin (Pygoscelis adeliae)Dream41.620.0204.0<NA>MALE
1523Adelie Penguin (Pygoscelis adeliae)Dream38.017.5194.0<NA>FEMALE
1525Adelie Penguin (Pygoscelis adeliae)Dream36.318.5194.0<NA>MALE
\n", - "
[5 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "tag_number \n", - "1225 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "1393 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", - "1524 Adelie Penguin (Pygoscelis adeliae) Dream 41.6 \n", - "1523 Adelie Penguin (Pygoscelis adeliae) Dream 38.0 \n", - "1525 Adelie Penguin (Pygoscelis adeliae) Dream 36.3 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "tag_number \n", - "1225 \n", - "1393 \n", - "1524 20.0 204.0 MALE \n", - "1523 17.5 194.0 FEMALE \n", - "1525 18.5 194.0 MALE \n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.body_mass_g.isnull()]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "a70c2027", - "metadata": {}, - "source": [ - "Here we see three Adelie penguins with tag numbers 1523, 1524, 1525 are missing their body_mass_g but have the other measurements. These are the ones we need to guess. We can do this by training a statistical model on the measurements that we do have, and then using it to predict the missing values.\n", - "\n", - "Our conservationists warned us that trying to generalize across species is a bad idea, so for now lets just try building a model for Adelie penguins. We can revisit it later and see if including the other observations improves the model performance." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "93ff013a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
tag_number
1172Adelie Penguin (Pygoscelis adeliae)Dream32.115.5188.03050.0FEMALE
1371Adelie Penguin (Pygoscelis adeliae)Biscoe37.716.0183.03075.0FEMALE
1417Adelie Penguin (Pygoscelis adeliae)Torgersen38.617.0188.02900.0FEMALE
1204Adelie Penguin (Pygoscelis adeliae)Dream40.717.0190.03725.0MALE
1251Adelie Penguin (Pygoscelis adeliae)Biscoe37.617.0185.03600.0FEMALE
1422Adelie Penguin (Pygoscelis adeliae)Torgersen35.717.0189.03350.0FEMALE
1394Adelie Penguin (Pygoscelis adeliae)Torgersen40.217.0176.03450.0FEMALE
1163Adelie Penguin (Pygoscelis adeliae)Dream36.417.0195.03325.0FEMALE
1329Adelie Penguin (Pygoscelis adeliae)Biscoe38.117.0181.03175.0FEMALE
1406Adelie Penguin (Pygoscelis adeliae)Torgersen44.118.0210.04000.0MALE
1196Adelie Penguin (Pygoscelis adeliae)Dream36.518.0182.03150.0FEMALE
1228Adelie Penguin (Pygoscelis adeliae)Biscoe41.618.0192.03950.0MALE
1412Adelie Penguin (Pygoscelis adeliae)Torgersen40.318.0195.03250.0FEMALE
1142Adelie Penguin (Pygoscelis adeliae)Dream35.718.0202.03550.0FEMALE
1430Adelie Penguin (Pygoscelis adeliae)Torgersen33.519.0190.03600.0FEMALE
1333Adelie Penguin (Pygoscelis adeliae)Biscoe43.219.0197.04775.0MALE
1414Adelie Penguin (Pygoscelis adeliae)Torgersen38.719.0195.03450.0FEMALE
1197Adelie Penguin (Pygoscelis adeliae)Dream41.119.0182.03425.0MALE
1443Adelie Penguin (Pygoscelis adeliae)Torgersen40.619.0199.04000.0MALE
1295Adelie Penguin (Pygoscelis adeliae)Biscoe41.020.0203.04725.0MALE
\n", - "
[146 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "tag_number \n", - "1172 Adelie Penguin (Pygoscelis adeliae) Dream 32.1 \n", - "1371 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.7 \n", - "1417 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.6 \n", - "1204 Adelie Penguin (Pygoscelis adeliae) Dream 40.7 \n", - "1251 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.6 \n", - "1422 Adelie Penguin (Pygoscelis adeliae) Torgersen 35.7 \n", - "1394 Adelie Penguin (Pygoscelis adeliae) Torgersen 40.2 \n", - "1163 Adelie Penguin (Pygoscelis adeliae) Dream 36.4 \n", - "1329 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.1 \n", - "1406 Adelie Penguin (Pygoscelis adeliae) Torgersen 44.1 \n", - "1196 Adelie Penguin (Pygoscelis adeliae) Dream 36.5 \n", - "1228 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.6 \n", - "1412 Adelie Penguin (Pygoscelis adeliae) Torgersen 40.3 \n", - "1142 Adelie Penguin (Pygoscelis adeliae) Dream 35.7 \n", - "1430 Adelie Penguin (Pygoscelis adeliae) Torgersen 33.5 \n", - "1333 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", - "1414 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.7 \n", - "1197 Adelie Penguin (Pygoscelis adeliae) Dream 41.1 \n", - "1443 Adelie Penguin (Pygoscelis adeliae) Torgersen 40.6 \n", - "1295 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.0 \n", - "1207 Adelie Penguin (Pygoscelis adeliae) Dream 38.8 \n", - "1349 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.2 \n", - "1350 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", - "1351 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.1 \n", - "1116 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "tag_number \n", - "1172 15.5 188.0 3050.0 FEMALE \n", - "1371 16.0 183.0 3075.0 FEMALE \n", - "1417 17.0 188.0 2900.0 FEMALE \n", - "1204 17.0 190.0 3725.0 MALE \n", - "1251 17.0 185.0 3600.0 FEMALE \n", - "1422 17.0 189.0 3350.0 FEMALE \n", - "1394 17.0 176.0 3450.0 FEMALE \n", - "1163 17.0 195.0 3325.0 FEMALE \n", - "1329 17.0 181.0 3175.0 FEMALE \n", - "1406 18.0 210.0 4000.0 MALE \n", - "1196 18.0 182.0 3150.0 FEMALE \n", - "1228 18.0 192.0 3950.0 MALE \n", - "1412 18.0 195.0 3250.0 FEMALE \n", - "1142 18.0 202.0 3550.0 FEMALE \n", - "1430 19.0 190.0 3600.0 FEMALE \n", - "1333 19.0 197.0 4775.0 MALE \n", - "1414 19.0 195.0 3450.0 FEMALE \n", - "1197 19.0 182.0 3425.0 MALE \n", - "1443 19.0 199.0 4000.0 MALE \n", - "1295 20.0 203.0 4725.0 MALE \n", - "1207 20.0 190.0 3950.0 MALE \n", - "1349 20.0 190.0 3900.0 MALE \n", - "1350 20.0 190.0 4250.0 MALE \n", - "1351 16.5 198.0 3825.0 FEMALE \n", - "1116 16.5 185.0 3400.0 FEMALE \n", - "...\n", - "\n", - "[146 rows x 7 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get all the rows with adelie penguins\n", - "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", - "\n", - "# separate out the rows that have a body mass measurement\n", - "training_data = adelie_data[adelie_data.body_mass_g.notnull()]\n", - "\n", - "# we noticed there were also some rows that were missing other values,\n", - "# lets remove these so they don't affect our results\n", - "training_data = training_data.dropna()\n", - "\n", - "# lets take a quick peek and make sure things look right:\n", - "training_data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d55a39f9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "species 
string[pyarrow]\n", - "island string[pyarrow]\n", - "culmen_length_mm Float64\n", - "culmen_depth_mm Float64\n", - "flipper_length_mm Float64\n", - "body_mass_g Float64\n", - "sex string[pyarrow]\n", - "dtype: object" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# we'll look at the schema too:\n", - "training_data.dtypes" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "59d374b5", - "metadata": {}, - "source": [ - "Great! Now lets configure a linear regression model to predict body mass from the other columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "18c4cecf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes.ml.linear_model as ml\n", - "\n", - "model = ml.LinearRegression()\n", - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6e54a1a2", - "metadata": {}, - "source": [ - "As in SKLearn, an unfitted model object is just a bundle of parameters." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a2060cf1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'fit_intercept': True}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# lets view the parameters\n", - "model.get_params()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8e25fe41", - "metadata": {}, - "source": [ - "For this task, really all the default options are fine. But just so we can see how configuration works, lets specify that we want to use gradient descent to find the solution:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "327e2232", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.optimize_strategy = \"BATCH_GRADIENT_DESCENT\"\n", - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2c2e0835", - "metadata": {}, - "source": [ - "BigQuery models provide a couple of extra conveniences:\n", - "\n", - "1. By default, they will automatically perform feature engineering on the inputs - encoding our string columns and scaling our numeric columns.\n", - "2. By default, they will also automatically manage the test/training data split for us.\n", - "\n", - "So all we need to do is hook our chosen feature and label columns into the model and call .fit()!" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "085c9a99", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "y_train = training_data[['body_mass_g']]\n", - "model.fit(X_train, y_train)\n", - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9e76e10c", - "metadata": {}, - "source": [ - "...and there, we've successfully trained a linear regressor model. 
Lets see how it performs, using the automatic data split:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "c9458c02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 223.878763 78553.601634 0.005614 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 181.330911 0.623951 0.623951 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.score(X_train, y_train)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f0b39603", - "metadata": {}, - "source": [ - "Great! The model seems useful, predicting 62% of the variance.\n", - "\n", - "We realize we made a mistake though - we're trying to predict mass using a linear model, but mass will increase with the cube of the penguin's size, whereas our inputs are linear with size. Can we improve our model by cubing them?" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "b94eddc7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\ndef cubify(penguin_df):\\n penguin_df.culmen_length_mm = train_x.culmen_length_mm.pow(3)\\n penguin_df.culmen_depth_mm = train_x.culmen_depth_mm.pow(3)\\n penguin_df.flipper_length_mm = train_x.flipper_length_mm.pow(3)\\n\\ncubify(train_x)\\ntrain_x\\n'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# SKIP THIS STEP (not yet working in BigQuery DataFrame)\n", - "\n", - "# lets define a preprocessing step that adjusts the linear measurements to use the cube\n", - "'''\n", - "def cubify(penguin_df):\n", - " penguin_df.culmen_length_mm = X_train.culmen_length_mm.pow(3)\n", - " penguin_df.culmen_depth_mm = X_train.culmen_depth_mm.pow(3)\n", - " penguin_df.flipper_length_mm = X_train.flipper_length_mm.pow(3)\n", - "\n", - "cubify(X_train)\n", - "X_train\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1b0e3f02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nmodel.fit(train_x, train_y)\\nmodel.evaluate()\\n'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# AS ABOVE, SKIP FOR NOW\n", - "'''\n", - "model.fit(X_train, y_train)\n", - "model.evaluate()\n", - "'''" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "45c5e755", - "metadata": {}, - "source": [ - "Now that we're satisfied with our model, lets see what it predicts for those Adelie penguins with no body mass measurement:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f21ebc1f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_g
tag_number
13933459.735118
15244304.175638
15233471.668379
15253947.881639
\n", - "
[4 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "tag_number \n", - "1393 3459.735118\n", - "1524 4304.175638\n", - "1523 3471.668379\n", - "1525 3947.881639\n", - "\n", - "[4 rows x 1 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Lets predict the missing observations\n", - "missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]\n", - "\n", - "model.predict(missing_body_mass)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "e66bd0b0", - "metadata": {}, - "source": [ - "Because we created it without a name, it was just a temporary model that will disappear after 24 hours. \n", - "\n", - "We decide that this approach is promising, so lets tell BigQuery to save it." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c508691b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.to_gbq(\"bqml_tutorial.penguins_model\", replace=True)\n", - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "46abef08", - "metadata": {}, - "source": [ - "We can now use this model anywhere in BigQuery with this name. We can also load\n", - "it again in our BigQuery DataFrames session and evaluate it or run inference with\n", - "it without needing to retrain it:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0c87e972", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model = bigframes.pandas.read_gbq_model(\"bqml_tutorial.penguins_model\")\n", - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "d6ab8def", - "metadata": {}, - "source": [ - "And of course we can retrain it if we like. Lets make another version that is based on all the penguins, so we can test that assumption we made at the beginning that it would be best to separate them:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "f4960452", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0224.71743379527.8796230.005693169.2358690.6192870.619287
\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 224.717433 79527.879623 0.005693 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 169.235869 0.619287 0.619287 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# This time we'll take all the training data, for all species\n", - "training_data = df[df.body_mass_g.notnull()]\n", - "training_data = training_data.dropna()\n", - "\n", - "# And we'll include species in our features\n", - "X_train = training_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "y_train = training_data[['body_mass_g']]\n", - "model.fit(X_train, y_train)\n", - "\n", - "# And we'll evaluate it on the Adelie penguins only\n", - "adelie_data = training_data[training_data.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", - "X_test = adelie_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "y_test = adelie_data[['body_mass_g']]\n", - "model.score(X_test, y_test)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7d101140", - "metadata": {}, - "source": [ - "It looks like the conservationists were right! Including other species, even though it gave us more training data, worsened prediction on the Adelie penguins." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7f3fe50d", - "metadata": {}, - "source": [ - "===============================================\n", - "\n", - "**Everything below this line not yet implemented**" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "62577c72", - "metadata": {}, - "source": [ - "We want to productionalize this model, so lets start publishing it to the vertex model registry ([prerequisites](https://cloud.google.com/bigquery-ml/docs/managing-models-vertex#prerequisites))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b82e79ee", - "metadata": {}, - "outputs": [], - "source": [ - "model.publish(\n", - " registry=\"vertex_ai\",\n", - " vertex_ai_model_version_aliases=[\"experimental\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "69d2482c", - "metadata": {}, - "source": [ - "Now when we fit the model, we can see it published here: https://console.cloud.google.com/vertex-ai/models" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b97d9b64", - "metadata": {}, - "source": [ - "# Custom feature engineering" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c837ace9", - "metadata": {}, - "source": [ - "So far, we've relied on BigQuery to do our feature engineering for us. What if we want to do it manually?\n", - "\n", - "BigQuery DataFrames provides a way to do this using Pipelines." 
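The pipeline in the next cell pushes every feature through `StandardScaler`. Since the penguin features mix numeric measurements with string columns such as `island` and `sex`, a fuller pipeline would route each group through its own preprocessor. Here is a minimal sketch, assuming `bigframes.ml.compose.ColumnTransformer` and `bigframes.ml.preprocessing.OneHotEncoder` mirror their scikit-learn counterparts; treat it as illustrative rather than as the notebook's tested code.

```python
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.preprocessing import OneHotEncoder, StandardScaler

# Assumption: ColumnTransformer accepts (name, transformer, columns) triples,
# as in scikit-learn. Scale the numeric columns; one-hot encode the strings.
preprocessor = ColumnTransformer([
    ("scale", StandardScaler(),
     ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]),
    ("encode", OneHotEncoder(), ["island", "sex"]),
])

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("linreg", LinearRegression()),
])

pipe.fit(X_train, y_train)
```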
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "480cb12f", - "metadata": {}, - "outputs": [], - "source": [ - "from bigframes.ml.pipeline import Pipeline\n", - "from bigframes.ml.preprocessing import StandardScaler\n", - "\n", - "pipe = Pipeline([\n", - " ('scaler', StandardScaler()),\n", - " ('linreg', LinearRegression())\n", - "])\n", - "\n", - "pipe.fit(X_train, y_train)\n", - "pipe.evaluate()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9a0e7d19", - "metadata": {}, - "source": [ - "We can then save the entire pipeline to BigQuery; BigQuery will save it as a single model, with the pre-processing steps embedded in the TRANSFORM property:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d1831ed", - "metadata": {}, - "outputs": [], - "source": [ - "pipe.to_gbq(\"bqml_tutorial.penguins_pipeline\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f6b60898", - "metadata": {}, - "source": [ - "# Custom data split" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "60ac0174", - "metadata": {}, - "source": [ - "BigQuery has also managed splitting out our training data. What if we want to do this manually?\n", - "\n", - "*TODO: Write this section*" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "a850322d07d9bdc9ec5f301d307e048bcab2390ae395e1cbce9335f4e081e5e2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/experimental/multimodal_dataframe.ipynb b/notebooks/experimental/multimodal_dataframe.ipynb new file mode 100644 index 0000000000..9c76654a53 --- /dev/null +++ b/notebooks/experimental/multimodal_dataframe.ipynb @@ -0,0 +1,1078 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YOrUAvz6DMw-" + }, + "source": [ + "# BigFrames Multimodal DataFrame\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is introducing BigFrames experimental Multimodal features:\n", + "1. Create Multimodal DataFrame\n", + "2. Combine unstructured data with structured data\n", + "3. Conduct image transformations\n", + "4. Use LLM models to ask questions and generate embeddings on images\n", + "5. PDF chunking function" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PEAJQQ6AFg-n" + }, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bGyhLnfEeB0X", + "outputId": "83ac8b64-3f44-4d43-d089-28a5026cbb42" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/_config/experiment_options.py:68: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject\n", + "to change in the future.\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project, project needs to be allowlisted go/bq-multimodal-allowlist (internal)\n", + "# User must have https://cloud.google.com/bigquery/docs/use-bigquery-dataframes#permissions to use bigframes, BQ connection admin/user to create/use connections, BQ ObjRef permissions for ObjectRef and BQ routines permissions for using transform functions.\n", + "# Or simply has BQ Admin role for all.\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "# Flag to enable the feature\n", + "bigframes.options.experiments.blob = True\n", + "\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ifKOq7VZGtZy" + }, + "source": [ + "### 1. Create Multimodal DataFrame\n", + "There are several ways to create Multimodal DataFrame. The easiest way is from the wiledcard paths." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fx6YcZJbeYru", + "outputId": "d707954a-0dd0-4c50-b7bf-36b140cf76cf" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + } + ], + "source": [ + "# Create blob columns from wildcard path.\n", + "df_image = bpd.from_glob_path(\n", + " \"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\", name=\"image\"\n", + ")\n", + "# Other ways are: from string uri column\n", + "# df = bpd.DataFrame({\"uri\": [\"gs:///\", \"gs:///\"]})\n", + "# df[\"blob_col\"] = df[\"uri\"].str.to_blob()\n", + "\n", + "# From an existing object table\n", + "# df = bpd.read_gbq_object_table(\"\", name=\"blob_col\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 487 + }, + "id": "HhCb8jRsLe9B", + "outputId": "03081cf9-3a22-42c9-b38f-649f592fdada" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image
0
1
2
3
4
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " image\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Take only 5 images to work with, and preview the content of the Multimodal DataFrame\n", + "df_image = df_image.head(5)\n", + "df_image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b6RRZb3qPi_T" + }, + "source": [ + "### 2. Combine unstructured data with structured data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4YJCdmLtR-qu" + }, + "source": [ + "Now you can put more information into the table to describe the files, such as author info from inputs, or other metadata from the GCS object itself." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "YYYVn7NDH0Me" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdated
0aliceimage/png14894052025-03-20 17:44:58+00:00
1bobimage/png15380072025-03-20 17:44:56+00:00
2bobimage/png12375512025-03-20 17:45:14+00:00
3aliceimage/png10404552025-03-20 17:44:45+00:00
4bobimage/png15179382025-03-20 17:45:05+00:00
\n", + "

5 rows × 5 columns

\n", + "
[5 rows x 5 columns in total]" + ], + "text/plain": [ + " image author content_type \\\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "\n", + " size updated \n", + "0 1489405 2025-03-20 17:44:58+00:00 \n", + "1 1538007 2025-03-20 17:44:56+00:00 \n", + "2 1237551 2025-03-20 17:45:14+00:00 \n", + "3 1040455 2025-03-20 17:44:45+00:00 \n", + "4 1517938 2025-03-20 17:45:05+00:00 \n", + "\n", + "[5 rows x 5 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Combine unstructured data with structured data\n", + "df_image[\"author\"] = [\"alice\", \"bob\", \"bob\", \"alice\", \"bob\"] # type: ignore\n", + "df_image[\"content_type\"] = df_image[\"image\"].blob.content_type()\n", + "df_image[\"size\"] = df_image[\"image\"].blob.size()\n", + "df_image[\"updated\"] = df_image[\"image\"].blob.updated()\n", + "df_image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NUd4Kog_QLRS" + }, + "source": [ + "Then you can filter the rows based on the structured data. For the different content types, you can display them individually or together." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "UGuAk9PNDRF3", + "outputId": "73feb33d-4a05-48fb-96e5-3c48c2a456f3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# filter images and display, you can also display audio and video types\n", + "df_image[df_image[\"author\"] == \"alice\"][\"image\"].blob.display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1IJuakwJTZey" + }, + "source": [ + "### 3. Conduct image transformations\n", + "BigFrames Multimodal DataFrame provides image (and other) transformation functions, such as image_blur, image_resize, and image_normalize. The output can be saved to GCS folders or to BQ as bytes."
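The transformation cells below write their outputs to GCS folders via the `dst=` argument. As a sketch of the "to BQ as bytes" option mentioned above, assuming that omitting `dst` keeps the transformed content as bytes in BigQuery (verify against the current blob API docs):

```python
# A minimal sketch (assumption: with no `dst`, the transformed image is
# stored as bytes in BigQuery rather than written to a GCS folder).
df_image["blurred_bytes"] = df_image["image"].blob.image_blur((20, 20))
```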
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VWsl5BBPJ6N7", + "outputId": "45d2356e-322b-4982-cfa7-42d034dc4344" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n" + ] + } + ], + "source": [ + "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n", + " (20, 20), dst=\"gs://bigframes_blob_test/image_blur_transformed/\"\n", + ")\n", + "df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n", + " (300, 200), dst=\"gs://bigframes_blob_test/image_resize_transformed/\"\n", + ")\n", + "df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n", + " alpha=50.0,\n", + " beta=150.0,\n", + " norm_type=\"minmax\",\n", + " dst=\"gs://bigframes_blob_test/image_normalize_transformed/\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rWCAGC8w64vU", + "outputId": "d7d456f0-8b56-492c-fe1b-967e9664d813" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n" + ] + } + ], + "source": [ + "# You can also chain functions together\n", + "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=\"gs://bigframes_blob_test/image_blur_resize_transformed/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 605 + }, + "id": "6NGK6GYSU44B", + "outputId": "859101c1-2ee4-4f9a-e250-e8947127420a" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdatedblurredresizednormalizedblur_resized
0aliceimage/png14894052025-03-20 17:44:58+00:00
1bobimage/png15380072025-03-20 17:44:56+00:00
2bobimage/png12375512025-03-20 17:45:14+00:00
3aliceimage/png10404552025-03-20 17:44:45+00:00
4bobimage/png15179382025-03-20 17:45:05+00:00
\n", + "

5 rows × 9 columns

\n", + "
[5 rows x 9 columns in total]" + ], + "text/plain": [ + " image author content_type \\\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "\n", + " size updated \\\n", + "0 1489405 2025-03-20 17:44:58+00:00 \n", + "1 1538007 2025-03-20 17:44:56+00:00 \n", + "2 1237551 2025-03-20 17:45:14+00:00 \n", + "3 1040455 2025-03-20 17:44:45+00:00 \n", + "4 1517938 2025-03-20 17:45:05+00:00 \n", + "\n", + " blurred \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "\n", + " resized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "\n", + " normalized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "\n", + " blur_resized \n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Euk5saeVVdTP" + }, + "source": [ + "### 4. Use LLM models to ask questions and generate embeddings on images" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "mRUGfcaFVW-3" + }, + "outputs": [], + "source": [ + "from bigframes.ml import llm\n", + "gemini = llm.GeminiTextGenerator(model_name=\"gemini-1.5-flash-002\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 657 + }, + "id": "DNFP7CbjWdR9", + "outputId": "3f90a062-0abc-4bce-f53c-db57b06a14b9" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:107: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0That's a bag of **rabbit food** from the brand **Fluffy Buns**. The specific product is labeled as \"Ranbhow's trood.flee!\" (which appears to be a playful brand name).
1That's hay. More specifically, it looks like a type of grass hay, often used as feed for small animals like rabbits, guinea pigs, and chinchillas.
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " ml_generate_text_llm_result \\\n", + "0 That's a bag of **rabbit food** from the brand... \n", + "1 That's hay. More specifically, it looks like ... \n", + "\n", + " image \n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Ask the same question on the images\n", + "df_image = df_image.head(2)\n", + "answer = gemini.predict(df_image, prompt=[\"what item is it?\", df_image[\"image\"]])\n", + "answer[[\"ml_generate_text_llm_result\", \"image\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "IG3J3HsKhyBY" + }, + "outputs": [], + "source": [ + "# Ask different questions\n", + "df_image[\"question\"] = [\"what item is it?\", \"what color is the picture?\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 657 + }, + "id": "qKOb765IiVuD", + "outputId": "731bafad-ea29-463f-c8c1-cb7acfd70e5d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:107: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0That's a bag of **Fluffy Buns Rabbit Food**. It's a blend of various ingredients designed as food for rabbits.
1The picture is primarily light green. There are some slightly darker green and yellowish-green shades mixed in, but the overall color is light green.
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " ml_generate_text_llm_result \\\n", + "0 That's a bag of **Fluffy Buns Rabbit Food**. ... \n", + "1 The picture is primarily light green. There a... \n", + "\n", + " image \n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer_alt = gemini.predict(df_image, prompt=[df_image[\"question\"], df_image[\"image\"]])\n", + "answer_alt[[\"ml_generate_text_llm_result\", \"image\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "KATVv2CO5RT1", + "outputId": "6ec01f27-70b6-4f69-c545-e5e3c879480c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:107: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_embedding_resultml_generate_embedding_statusml_generate_embedding_start_secml_generate_embedding_end_seccontent
0[ 0.01182145 0.01575819 0.06243018 ... 0.00010706 -0.03063935\n", + " -0.05756916]{\"access_urls\":{\"expiry_time\":\"2025-04-09T02:36:17Z\",\"read_url\":\"https://storage.googleapis.com/cloud-samples-data/bigquery%2Ftutorials%2Fcymbal-pets%2Fimages%2Ffluffy-buns-rabbit-food.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20250408%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250408T203617Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&generation=1742492698095241&X-Goog-Signature=860fb5fbf48778f66f0ec2d141b26dd7ce2de7cacc427491f5cc3f420361770e33936c79552a562a3db31c8e02a9ea73e73ac9c7c379dfa0e213eda456c48243acea3227c3e9e786859b19e2b74718d7c3447f09ba371d77e3df65a9e2936c9b9ad5ad7ba359bfaa9fc3a2785d32359a9d50ee64f90f6e7d3a20a5c13f38f932c83b143dd2abdd31f0b35ab60aa21293d2cbf7ea780b13ef02d6b1f9aa56538a498d3da13798a1cbe2535b118caeb35f1e5be36d09c9593796b5ecf8b171d4915735644a94d19d7e78351e475da7b75f72fc8f88b2607ce8d1fb53d7dc2aa16da3b6ed2130fd700cbc797d1a6cc495833945b3bdfaf933b9a4dc70ff3299ab4f\",\"write_url\":\"\"},\"objectref\":{\"authorizer\":\"bigframes-dev.us.bigframes-default-connection\",\"details\":{\"gcs_metadata\":{\"content_type\":\"image/png\",\"md5_hash\":\"4c01d79182ea7580183a2168076e16b8\",\"size\":1489405,\"updated\":1742492698000000}},\"uri\":\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/fluffy-buns-rabbit-food.png\",\"version\":\"1742492698095241\"}}
1[ 0.02554693 0.01508185 0.04101892 ... -0.02417112 -0.01356636\n", + " -0.01999673]{\"access_urls\":{\"expiry_time\":\"2025-04-09T02:36:17Z\",\"read_url\":\"https://storage.googleapis.com/cloud-samples-data/bigquery%2Ftutorials%2Fcymbal-pets%2Fimages%2Ffluffy-buns-guinea-pig-hay.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20250408%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250408T203617Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&generation=1742492696656039&X-Goog-Signature=192e852a5296d31a048af459afe3dc539e2bbf90c65bc2997219e7822bd0ca2858b8e04475e12d14d63d295b45e51403b4f4585a6b66c8b0dbc3adf19e135a93687aeff7ba675eec2aeddb4a1cb4d2b83bee22c7c2de80287af63158a85ee56fa1daccbf31bf42d57e5724ea24bdd630a8a1930d70a5d38fb0340d846848039f53bf4efbc21da6df9a7d91fec727385018b159e4fc53fce0b57ab0c77583361bc4e10b2a7080aafa288789240e565eb58cb9abf2bd298732fddaad4f32472110b2607f6b3a21d9fbce1fc3ecb23caf967a4e3ff5101ae29fc6c65b888930a1306c8deb3b569997a0a364325b3ac0350ff671f2682d9a8a4a96bfac28eb9f9fd8\",\"write_url\":\"\"},\"objectref\":{\"authorizer\":\"bigframes-dev.us.bigframes-default-connection\",\"details\":{\"gcs_metadata\":{\"content_type\":\"image/png\",\"md5_hash\":\"0888367a63729f5a42f4a041596f635d\",\"size\":1538007,\"updated\":1742492696000000}},\"uri\":\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/fluffy-buns-guinea-pig-hay.png\",\"version\":\"1742492696656039\"}}
\n", + "

2 rows × 5 columns

\n", + "
[2 rows x 5 columns in total]" + ], + "text/plain": [ + " ml_generate_embedding_result \\\n", + "0 [ 0.01182145 0.01575819 0.06243018 ... 0.00... \n", + "1 [ 0.02554693 0.01508185 0.04101892 ... -0.02... \n", + "\n", + " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", + "0 \n", + "1 \n", + "\n", + " ml_generate_embedding_end_sec \\\n", + "0 \n", + "1 \n", + "\n", + " content \n", + "0 {\"access_urls\":{\"expiry_time\":\"2025-04-09T02:3... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2025-04-09T02:3... \n", + "\n", + "[2 rows x 5 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate embeddings.\n", + "embed_model = llm.MultimodalEmbeddingGenerator()\n", + "embeddings = embed_model.predict(df_image[\"image\"])\n", + "embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iRUi8AjG7cIf" + }, + "source": [ + "### 5. PDF chunking function" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "oDDuYtUm5Yiy" + }, + "outputs": [], + "source": [ + "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7jLpMYaj7nj8", + "outputId": "06d5456f-580f-4693-adff-2605104b056c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n" + ] + } + ], + "source": [ + "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "kaPvJATN7zlw" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n", + "0 on a level, stable surface to prevent tipping....\n", + "0 included)\n", + "to maintain the schedule during powe...\n", + "0 digits for Meal 1 will flash.\n", + "\u0000. Use the UP/DO...\n", + "0 paperclip) for 5\n", + "seconds. This will reset all ...\n", + "0 unit with a damp cloth. 
Do not immerse the bas...\n", + "0 continues,\n", + "contact customer support.\n", + "E2: Food ...\n", + "Name: chunked, dtype: string" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunked = df_pdf[\"chunked\"].explode()\n", + "chunked" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index d3fec469b4..fc46a43e7b 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -25,3164 +25,11 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "rWJnGj2ViouP" - }, - "source": [ - "# BigFrames AI (semantic) Operator Tutorial\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"BQ\n", - " Open in BQ Studio\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mgOrr256iouQ" - }, - "source": [ - "This notebook provides a hands-on preview of AI operator APIs powered by the Gemini model.\n", - "\n", - "The notebook is divided into two sections. The first section introduces the API syntax with examples, aiming to familiarize you with how AI operators work. The second section applies AI operators to a large real-world dataset and presents performance statistics.\n", - "\n", - "This work is inspired by [this paper](https://arxiv.org/pdf/2407.11418) and powered by BigQuery ML and Vertex AI." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ymVbJV2iouQ" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vvVzFzo3iouQ" - }, - "source": [ - "First, import the BigFrames modules.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Jb9glT2ziouQ" - }, - "outputs": [], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xQiCWj7OiouQ" - }, - "source": [ - "Make sure the BigFrames version is at least `1.23.0`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LTPpI8IpiouQ" - }, - "outputs": [], - "source": [ - "from packaging.version import Version\n", - "\n", - "assert Version(bigframes.__version__) >= Version(\"1.23.0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agxLmtlbiouR" - }, - "source": [ - "Turn on the semantic operator experiment. You will see a warning sign saying that these operators are still under experiments. If you don't turn on the experiment before using the operators, you will get `NotImplemenetedError`s." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1wXqdDr8iouR" - }, - "outputs": [], - "source": [ - "bigframes.options.experiments.semantic_operators = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W8TPUvnsqxhv" - }, - "source": [ - "Specify your GCP project and location." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vCkraKOeqJFl" - }, - "outputs": [], - "source": [ - "bpd.options.bigquery.project = 'YOUR_PROJECT_ID'\n", - "bpd.options.bigquery.location = 'US'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n95MFlS0iouR" - }, - "source": [ - "**Optional**: turn off the display of progress bar so that only the operation results will be printed out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5r6ahx7MiouR" - }, - "outputs": [], - "source": [ - "# bpd.options.display.progress_bar = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "93iYvp7niouR" - }, - "source": [ - "Create LLM instances. They will be passed in as parameters for each semantic operator.\n", - "\n", - "This tutorial uses the \"gemini-1.5-flash-002\" model for text generation and \"text-embedding-005\" for embedding. While these are recommended, you can choose [other Vertex AI LLM models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) based on your needs and availability. Ensure you have [sufficient quota](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas) for your chosen models and adjust it if necessary." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tHkymaLNiouR" - }, - "outputs": [], - "source": [ - "from bigframes.ml import llm\n", - "gemini_model = llm.GeminiTextGenerator(model_name=\"gemini-1.5-flash-001\")\n", - "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mbFDcvnPiouR" - }, - "source": [ - "**Note**: semantic operators could be expensive over a large set of data. As a result, our team added the option `bigframes.options.compute.semantic_ops_confirmation_threshold` in version `1.31.0` so that BigFrames will ask for your confirmation if the amount of data to be processed is too large. If the number of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", - "\n", - "The default threshold is 0, which means the operators will always ask for confirmations. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "F4dZm4b7iouR" - }, - "outputs": [], - "source": [ - "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", - " bigframes.options.compute.semantic_ops_confirmation_threshold = 1000" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_dEA3G9RiouR" - }, - "source": [ - "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.semantic_ops_threshold_autofail` to `True`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BoUK-cpbiouS" - }, - "outputs": [], - "source": [ - "# if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", - "# bigframes.options.compute.semantic_ops_threshold_autofail = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hQft3o3OiouS" - }, - "source": [ - "# API Samples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dt5Kl-QGiouS" - }, - "source": [ - "You will learn about each semantic operator by trying some examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J7XAT459iouS" - }, - "source": [ - "## Semantic Filtering" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9d5HUIvliouS" - }, - "source": [ - "Semantic filtering allows you to filter your dataframe based on the instruction (i.e. prompt) you provided.\n", - "\n", - "First, create a dataframe:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "NDpCRGd_iouS", - "outputId": "5048c935-06d3-4ef1-ad87-72e14a30b1b7" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countrycity
0USASeattle
1GermanyBerlin
2JapanKyoto
\n", - "

3 rows × 2 columns

\n", - "
[3 rows x 2 columns in total]" - ], - "text/plain": [ - " country city\n", - "0 USA Seattle\n", - "1 Germany Berlin\n", - "2 Japan Kyoto\n", - "\n", - "[3 rows x 2 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({'country': ['USA', 'Germany', 'Japan'], 'city': ['Seattle', 'Berlin', 'Kyoto']})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6AXmT7sniouS" - }, - "source": [ - "Now, filter this dataframe by keeping only the rows where the value in `city` column is the capital of the value in `country` column. The column references could be \"escaped\" by using a pair of braces in your instruction. In this example, your instruction should be like this:\n", - "```\n", - "The {city} is the capital of the {country}.\n", - "```\n", - "\n", - "Note that this is not a Python f-string, so you shouldn't prefix your instruction with an `f`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 127 - }, - "id": "ipW3Z_l4iouS", - "outputId": "ad447459-225a-419c-d4c8-fedac4a9ed0f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countrycity
1GermanyBerlin
\n", - "

1 rows × 2 columns

\n", - "
[1 rows x 2 columns in total]" - ], - "text/plain": [ - " country city\n", - "1 Germany Berlin\n", - "\n", - "[1 rows x 2 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.semantics.filter(\"The {city} is the capital of the {country}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "swKvgfm1iouS" - }, - "source": [ - "The filter operator extracts the information from the referenced column to enrich your instruction with context. The instruction is then sent to the designated model for evaluation. For filtering operations, the LLM is asked to return only `True` or `False` for each row, and the operator removes the rows accordingly." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_2AAGGoiouS" - }, - "source": [ - "## Semantic Mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vT6skC57iouS" - }, - "source": [ - "Semantic mapping allows you to combine values from multiple columns into a single output based on your instruction.\n", - "\n", - "Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "BQ7xeUK3iouS", - "outputId": "33dcb742-77ed-4bea-8dbc-1cf775102a25" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ingredient_1ingredient_2
0BunBeef Patty
1Soy BeanBittern
2SausageLong Bread
\n", - "

3 rows × 2 columns

\n", - "
[3 rows x 2 columns in total]" - ], - "text/plain": [ - " ingredient_1 ingredient_2\n", - "0 Bun Beef Patty\n", - "1 Soy Bean Bittern\n", - "2 Sausage Long Bread\n", - "\n", - "[3 rows x 2 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\n", - " \"ingredient_1\": [\"Bun\", \"Soy Bean\", \"Sausage\"],\n", - " \"ingredient_2\": [\"Beef Patty\", \"Bittern\", \"Long Bread\"]\n", - " })\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VFObP2aFiouS" - }, - "source": [ - "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "PpL24AQFiouS", - "outputId": "e7aff038-bf4b-4833-def8-fe2648e8885b" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ingredient_1ingredient_2food
0BunBeef PattyBurger
1Soy BeanBitternTofu
2SausageLong BreadHotdog
\n", - "

3 rows × 3 columns

\n", - "
[3 rows x 3 columns in total]" - ], - "text/plain": [ - " ingredient_1 ingredient_2 food\n", - "0 Bun Beef Patty Burger \n", - "\n", - "1 Soy Bean Bittern Tofu \n", - "\n", - "2 Sausage Long Bread Hotdog \n", - "\n", - "\n", - "[3 rows x 3 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "70WTZZfdiouS" - }, - "source": [ - "## Semantic Joining" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u93uieRaiouS" - }, - "source": [ - "Semantic joining can join two dataframes based on the instruction you provided.\n", - "\n", - "First, you prepare two dataframes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dffIGEUEiouS" - }, - "outputs": [], - "source": [ - "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", - "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Hz0X-0RtiouS" - }, - "source": [ - "You want to join the `cities` with `continents` to form a new dataframe such that, in each row the city from the `cities` data frame is in the continent from the `continents` dataframe. You could re-use the aforementioned column reference syntax:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - }, - "id": "WPIOHEwCiouT", - "outputId": "976586c3-b5db-4088-a46a-44dfbf822ecb" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
citycontinent
0SeattleNorth America
1OttawaNorth America
2ShanghaiAsia
3New DelhiAsia
\n", - "

4 rows × 2 columns

\n", - "
[4 rows x 2 columns in total]" - ], - "text/plain": [ - " city continent\n", - "0 Seattle North America\n", - "1 Ottawa North America\n", - "2 Shanghai Asia\n", - "3 New Delhi Asia\n", - "\n", - "[4 rows x 2 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cities.semantics.join(continents, \"{city} is in {continent}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Qc97GMWiouT" - }, - "source": [ - "!! **Important:** Semantic join can trigger prohibitively expensive operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total number of queries sent to the LLM is on the scale of `M * N`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MUEJXT1IiouT" - }, - "source": [ - "### Self Joins" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QvX-nCogiouT" - }, - "source": [ - "This self-join example demonstrates a special case: what happens when the joining columns exist in both data frames? It turns out that you need to provide extra information in your column references, by attaching \"left.\" and \"right.\" prefixes to your column names.\n", - "\n", - "Create an example data frame:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OIGz5sqxiouW" - }, - "outputs": [], - "source": [ - "animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VmJbuWNniouX" - }, - "source": [ - "You want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, you use `left.animal` and `right.animal` to differentiate the data sources:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 284 - }, - "id": "UHfggdhBiouX", - "outputId": "a439e3aa-1382-4244-951f-127dc8da0fe3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
animal_leftanimal_right
0cowcat
1cowspider
2catspider
3elephantcow
4elephantcat
5elephantspider
\n", - "

6 rows × 2 columns

\n", - "
[6 rows x 2 columns in total]" - ], - "text/plain": [ - " animal_left animal_right\n", - "0 cow cat\n", - "1 cow spider\n", - "2 cat spider\n", - "3 elephant cow\n", - "4 elephant cat\n", - "5 elephant spider\n", - "\n", - "[6 rows x 2 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "animals.semantics.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KONR7ywqiouX" - }, - "source": [ - "## Semantic Aggregation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I8iNRogoiouX" - }, - "source": [ - "Semantic aggregation merges all the values in a column into one. At this moment, you can only aggregate a single column per operator call.\n", - "\n", - "Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 315 - }, - "id": "9tsem17aiouX", - "outputId": "1db5fa6e-b59d-41f5-9c13-db2c9ed0415b" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Movies
0Titanic
1The Wolf of Wall Street
2Killers of the Flower Moon
3The Revenant
4Inception
5Shuttle Island
6The Great Gatsby
\n", - "

7 rows × 1 columns

\n", - "
[7 rows x 1 columns in total]" - ], - "text/plain": [ - " Movies\n", - "0 Titanic\n", - "1 The Wolf of Wall Street\n", - "2 Killers of the Flower Moon\n", - "3 The Revenant\n", - "4 Inception\n", - "5 Shuttle Island\n", - "6 The Great Gatsby\n", - "\n", - "[7 rows x 1 columns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\n", - " \"Movies\": [\n", - " \"Titanic\",\n", - " \"The Wolf of Wall Street\",\n", - " \"Killers of the Flower Moon\",\n", - " \"The Revenant\",\n", - " \"Inception\",\n", - " \"Shuttle Island\",\n", - " \"The Great Gatsby\",\n", - " ],\n", - "})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uA9XpV0aiouX" - }, - "source": [ - "You ask the LLM to find the oldest movie:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KzYoX3mRiouX", - "outputId": "1ac50d7b-dfa7-4c16-8daf-aeb03b6df7a5" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Titanic \n", - "\n", - "Name: Movies, dtype: string" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg_df = df.semantics.agg(\"Find the oldest movie from {Movies}. Reply with only the movie title\", model=gemini_model)\n", - "agg_df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "drvn75qJiouX" - }, - "source": [ - "Instead of going through the rows one by one, this operator first batches rows and produces one aggregation result per batch. It then repeatedly batches those intermediate results for further aggregation, until only one value is left. You can set the batch size with the `max_agg_rows` parameter, which defaults to 10." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kU7BsyTyiouX" - }, - "source": [ - "## Semantic Top K" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s9QePXEoiouX" - }, - "source": [ - "Semantic Top K selects the top K values based on your instruction. Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bMQqtyZ2iouX" - }, - "outputs": [], - "source": [ - "df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KiljGBSCiouX" - }, - "source": [ - "You want to find the top two most popular pets:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "OZv5WUGIiouX", - "outputId": "ae1cee27-cc31-455e-c4ac-c0a9a5cf4ca5" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
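To see how the hierarchical batching in the Semantic Aggregation note above translates into request counts, here is a small sketch, not from the original notebook, assuming each batch costs one LLM call:

```python
import math

def num_agg_calls(n_rows: int, max_agg_rows: int = 10) -> int:
    """Count LLM calls for the repeated batching described above."""
    calls = 0
    while n_rows > 1:
        batches = math.ceil(n_rows / max_agg_rows)
        calls += batches  # one call per batch in this round
        n_rows = batches  # the batch results become the next round's input
    return calls

print(num_agg_calls(7))    # 1: all seven movies fit in one batch
print(num_agg_calls(100))  # 11: ten first-round calls, then one final call
```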
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Animals
0Corgi
1Orange Cat
\n", - "

2 rows × 1 columns

\n", - "
[2 rows x 1 columns in total]" - ], - "text/plain": [ - " Animals\n", - "0 Corgi\n", - "1 Orange Cat\n", - "\n", - "[2 rows x 1 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.semantics.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dC8fyu3aiouX" - }, - "source": [ - "Under the hood, the semantic top K operator performs pair-wise comparisons with the LLM. The top K results are returned in the order of their indices instead of their ranks." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sIszJ0zPiouX" - }, - "source": [ - "## Semantic Search" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e4ojHRKAiouX" - }, - "source": [ - "Semantic search finds the values most similar to your query within a single column. Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "gnQSIZ5SiouX", - "outputId": "dd6e1ecb-1bad-4a7c-8065-e56c697d0863" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
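The pair-wise strategy noted above implies a worst case of n·(n−1)/2 comparisons, which bounds the LLM calls for small inputs. A tiny sketch, not from the original notebook; the actual operator may issue fewer calls:

```python
n = 4  # rows in the Animals example above
print(n * (n - 1) // 2)  # at most 6 pair-wise LLM comparisons for this input
```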
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
creatures
0salmon
1sea urchin
2baboons
3frog
4chimpanzee
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " creatures\n", - "0 salmon\n", - "1 sea urchin\n", - "2 baboons\n", - "3 frog\n", - "4 chimpanzee\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5apfIaZMiouX" - }, - "source": [ - "You want to get the top 2 creatures that are most similar to \"monkey\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "CkAuFgPYiouY", - "outputId": "723c7604-f53c-43d7-c754-4c91ec198dff" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
creaturessimilarity score
2baboons0.708434
4chimpanzee0.635844
\n", - "

2 rows × 2 columns

\n", - "
[2 rows x 2 columns in total]" - ], - "text/plain": [ - " creatures similarity score\n", - "2 baboons 0.708434\n", - "4 chimpanzee 0.635844\n", - "\n", - "[2 rows x 2 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.semantics.search(\"creatures\", query=\"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GDZeVzFTiouY" - }, - "source": [ - "Note that you are using a text embedding model this time. This model generates embedding vectors for both your query and the values in the search space. The operator then uses BigQuery's built-in VECTOR_SEARCH function to find the nearest neighbors of your query.\n", - "\n", - "In addition, `score_column` is an optional parameter for storing the distances between the results and your query. If not set, the score column won't be attached to the result." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EXNutIXqiouY" - }, - "source": [ - "## Semantic Similarity Join" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhWrhQMjiouY" - }, - "source": [ - "When you want to perform multiple similarity queries in the same value space, you can use similarity join to simplify your calls. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cUc7-8O6iouY" - }, - "outputs": [], - "source": [ - "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", - "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k96WerOviouY" - }, - "source": [ - "In this example, you want to pick the most related animal from `df2` for each value in `df1`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "wPV5EkfpiouY", - "outputId": "4be1211d-0353-4b94-8c27-ebd568e8e104" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
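To make concrete what `sim_join` condenses into a single call, this sketch (not from the original notebook) performs the equivalent one-search-per-query loop; it assumes `Series.to_pandas()` for local iteration:

```python
# Minimal sketch: the repeated-search pattern that sim_join replaces.
# Each iteration embeds one query and scans the search space separately.
for q in df1["animal"].to_pandas():
    match = df2.semantics.search("animal", query=q, top_k=1, model=text_embedding_model)
    print(q, "->", match["animal"].to_pandas().tolist())
```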
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
animalanimal_1distance
0monkeybaboon0.620521
1spiderscorpion0.728024
2salmontuna0.782141
3giraffeelephant0.7135
4sparrowowl0.810864
\n", - "

5 rows × 3 columns

\n", - "
[5 rows x 3 columns in total]" - ], - "text/plain": [ - " animal animal_1 distance\n", - "0 monkey baboon 0.620521\n", - "1 spider scorpion 0.728024\n", - "2 salmon tuna 0.782141\n", - "3 giraffe elephant 0.7135\n", - "4 sparrow owl 0.810864\n", - "\n", - "[5 rows x 3 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model=text_embedding_model, score_column='distance')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GplzD7v0iouY" - }, - "source": [ - "!! **Important:** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large datasets, use the `bigframes.options.compute.semantic_ops_confirmation_threshold` option to specify a threshold." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uG6FyMH_iouY" - }, - "source": [ - "## Semantic Cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uIh3ViNciouY" - }, - "source": [ - "Semantic Cluster groups similar values together. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jyQ_aT9qiouY" - }, - "outputs": [], - "source": [ - "df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K3IMIFrtiouY" - }, - "source": [ - "You want to cluster these products into 3 groups:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "0Tc0DqXJiouY", - "outputId": "1c8b6e28-713c-4666-e623-3b2c42c50b30" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
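One concrete guard, which this notebook also applies later on and which is sketched here under the assumption of bigframes >= 1.31.0: set a row-count threshold above which semantic operators ask for confirmation before sending requests.

```python
import bigframes

# Ask for confirmation whenever a semantic operator would process more rows
# than this threshold (sketch; option name as set later in this notebook).
bigframes.options.compute.semantic_ops_confirmation_threshold = 1000
```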
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ProductCluster ID
0Smartphone1
1Laptop1
2Coffee Maker1
3T-shirt1
4Jeans1
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" - ], - "text/plain": [ - " Product Cluster ID\n", - "0 Smartphone 1\n", - "1 Laptop 1\n", - "2 Coffee Maker 1\n", - "3 T-shirt 1\n", - "4 Jeans 1\n", - "\n", - "[5 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zWIzYX3niouY" - }, - "source": [ - "This operator uses the embedding model to generate vectors for each value, and then the KMeans algorithm for clustering." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgj8GoQhiouY" - }, - "source": [ - "# Performance Analyses" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EZomL0BciouY" - }, - "source": [ - "In this section, you will use the public BigQuery Hacker News dataset to perform some heavier workloads. We recommend reading through the code without executing it, to save time and money. The execution results are attached after each cell for your reference.\n", - "\n", - "First, load 3k rows from the table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "wRR0SrcSiouY", - "outputId": "3b25f3a3-09c7-4396-9107-4aa4cdb4b963" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
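The two-stage mechanics described above (embed, then KMeans) can be sketched with the bigframes.ml API. This is not the notebook's own code, and the embedding output column name `ml_generate_embedding_result` is an assumption about the model's output schema:

```python
from bigframes.ml.cluster import KMeans

# Sketch of cluster_by's mechanics: embed each value, then cluster the vectors.
embeddings = text_embedding_model.predict(df["Product"])
kmeans = KMeans(n_clusters=3)
kmeans.fit(embeddings[["ml_generate_embedding_result"]])  # assumed column name
clusters = kmeans.predict(embeddings[["ml_generate_embedding_result"]])
```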
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titletextbyscoretimestamptype
0<NA>Well, most people aren&#x27;t alcoholics, so I...slipframe<NA>2021-06-26 02:37:56+00:00comment
1<NA>No, you don&#x27;t really <i>need</i> a smartp...vetinari<NA>2023-04-19 15:56:34+00:00comment
2<NA>It&#x27;s for the late Paul Allen RIP. Should&...lsr_ssri<NA>2018-10-16 01:07:55+00:00comment
3<NA>Yup they are dangerous. Be careful Donald Trump.Sven7<NA>2015-08-10 16:05:54+00:00comment
4<NA>Sure, it&#x27;s totally reasonable. Just point...nicoburns<NA>2020-10-05 11:20:51+00:00comment
5<NA>I wonder how long before special forces start ...autisticcurio<NA>2020-09-01 15:38:50+00:00comment
6The Impending NY Tech Apocalypse: Here's What ...<NA>gaoprea32011-09-27 22:43:27+00:00story
7<NA>Where would you relocate to? I'm assuming that...pavel_lishin<NA>2011-09-16 19:02:01+00:00comment
8Eureca beta is live. A place for your business...<NA>ricardos12012-10-15 13:09:32+00:00story
9<NA>It doesn’t work on Safari, and WebKit based br...archiewood<NA>2023-04-21 16:45:13+00:00comment
10<NA>I guess I don’t see the relevance. Vegans eat ...stevula<NA>2023-01-19 20:05:54+00:00comment
11<NA>I remember watching the American news media go...fareesh<NA>2019-06-17 19:49:17+00:00comment
12<NA>This article is incorrectly using the current ...stale2002<NA>2018-03-18 18:57:21+00:00comment
13<NA>In the firm I made my internship, we have to u...iserlohnmage<NA>2019-10-22 10:41:01+00:00comment
14<NA>The main reason it requires unsafe is for memo...comex<NA>2017-05-05 20:45:37+00:00comment
15Discord vs. IRC Rough Notes<NA>todsacerdoti482024-07-12 18:39:52+00:00story
16<NA>you have to auth again when you use apple pay.empath75<NA>2017-09-12 18:58:20+00:00comment
17<NA>It goes consumer grade, automotive, military, ...moftz<NA>2021-04-13 01:24:03+00:00comment
18<NA>I don&#x27;t have a link handy but the differe...KennyBlanken<NA>2022-05-13 16:08:38+00:00comment
19<NA>&gt; I don&#x27;t think the use case you menti...colanderman<NA>2017-09-28 05:16:06+00:00comment
20<NA>I think you need to watch it again, because yo...vladimirralev<NA>2018-12-07 11:25:52+00:00comment
21Oh dear: new Yahoo anti-spoofing measures brea...<NA>joshreads12014-04-08 13:29:50+00:00story
22How Much Warmer Was Your City in 2016?<NA>smb0612017-02-16 23:26:34+00:00story
23<NA>Except that they clearly never tried to incent...aenis<NA>2022-01-31 17:08:57+00:00comment
24Working Best at Coffee Shops<NA>GiraffeNecktie2492011-04-19 14:25:17+00:00story
\n", - "

25 rows × 6 columns

\n", - "
[3000 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 The Impending NY Tech Apocalypse: Here's What ... \n", - "7 \n", - "8 Eureca beta is live. A place for your business... \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 Discord vs. IRC Rough Notes \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 Oh dear: new Yahoo anti-spoofing measures brea... \n", - "22 How Much Warmer Was Your City in 2016? \n", - "23 \n", - "24 Working Best at Coffee Shops \n", - "\n", - " text by score \\\n", - "0 Well, most people aren't alcoholics, so I... slipframe \n", - "1 No, you don't really need a smartp... vetinari \n", - "2 It's for the late Paul Allen RIP. Should&... lsr_ssri \n", - "3 Yup they are dangerous. Be careful Donald Trump. Sven7 \n", - "4 Sure, it's totally reasonable. Just point... nicoburns \n", - "5 I wonder how long before special forces start ... autisticcurio \n", - "6 gaoprea 3 \n", - "7 Where would you relocate to? I'm assuming that... pavel_lishin \n", - "8 ricardos 1 \n", - "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "10 I guess I don’t see the relevance. Vegans eat ... stevula \n", - "11 I remember watching the American news media go... fareesh \n", - "12 This article is incorrectly using the current ... stale2002 \n", - "13 In the firm I made my internship, we have to u... iserlohnmage \n", - "14 The main reason it requires unsafe is for memo... comex \n", - "15 todsacerdoti 48 \n", - "16 you have to auth again when you use apple pay. empath75 \n", - "17 It goes consumer grade, automotive, military, ... moftz \n", - "18 I don't have a link handy but the differe... KennyBlanken \n", - "19 > I don't think the use case you menti... colanderman \n", - "20 I think you need to watch it again, because yo... vladimirralev \n", - "21 joshreads 1 \n", - "22 smb06 1 \n", - "23 Except that they clearly never tried to incent... 
aenis \n", - "24 GiraffeNecktie 249 \n", - "\n", - " timestamp type \n", - "0 2021-06-26 02:37:56+00:00 comment \n", - "1 2023-04-19 15:56:34+00:00 comment \n", - "2 2018-10-16 01:07:55+00:00 comment \n", - "3 2015-08-10 16:05:54+00:00 comment \n", - "4 2020-10-05 11:20:51+00:00 comment \n", - "5 2020-09-01 15:38:50+00:00 comment \n", - "6 2011-09-27 22:43:27+00:00 story \n", - "7 2011-09-16 19:02:01+00:00 comment \n", - "8 2012-10-15 13:09:32+00:00 story \n", - "9 2023-04-21 16:45:13+00:00 comment \n", - "10 2023-01-19 20:05:54+00:00 comment \n", - "11 2019-06-17 19:49:17+00:00 comment \n", - "12 2018-03-18 18:57:21+00:00 comment \n", - "13 2019-10-22 10:41:01+00:00 comment \n", - "14 2017-05-05 20:45:37+00:00 comment \n", - "15 2024-07-12 18:39:52+00:00 story \n", - "16 2017-09-12 18:58:20+00:00 comment \n", - "17 2021-04-13 01:24:03+00:00 comment \n", - "18 2022-05-13 16:08:38+00:00 comment \n", - "19 2017-09-28 05:16:06+00:00 comment \n", - "20 2018-12-07 11:25:52+00:00 comment \n", - "21 2014-04-08 13:29:50+00:00 story \n", - "22 2017-02-16 23:26:34+00:00 story \n", - "23 2022-01-31 17:08:57+00:00 comment \n", - "24 2011-04-19 14:25:17+00:00 story \n", - "...\n", - "\n", - "[3000 rows x 6 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", - "hacker_news" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3e94DPOdiouY" - }, - "source": [ - "Then, keep only the rows that have text content:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mQl8hc1biouY", - "outputId": "2b4ffa85-9d95-4a20-9040-0420c67da2d4" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2556" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news_with_texts = hacker_news[hacker_news['text'].notnull()]\n", - "len(hacker_news_with_texts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWalDtLDiouZ" - }, - "source": [ - "You can get an idea of the input token length by calculating the average string length." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PZeg4LCUiouZ", - "outputId": "05b67cac-6b3d-42ef-d6d6-b578a9734f4c" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "390.05125195618155" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news_with_texts['text'].str.len().mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2IXqskHHiouZ" - }, - "source": [ - "**Optional**: You can raise the confirmation threshold for a smoother experience." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EpjXQ4FViouZ" - }, - "outputs": [], - "source": [ - "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", - " bigframes.options.compute.semantic_ops_confirmation_threshold = 5000" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SYFB-X1RiouZ" - }, - "source": [ - "Now it's the LLM's turn. You want to keep only the rows whose texts talk about iPhone. This will take several minutes to finish."
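The average string length above can be turned into a rough token estimate with the common ~4-characters-per-token heuristic. A sketch, not from the original notebook; the heuristic is an approximation, not Gemini's tokenizer:

```python
avg_chars = hacker_news_with_texts["text"].str.len().mean()  # ~390 above
rows = len(hacker_news_with_texts)                           # 2556 above
print(f"~{avg_chars / 4:.0f} tokens per row, "
      f"~{rows * avg_chars / 4:,.0f} input tokens total")
```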
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "rditQlmoiouZ", - "outputId": "2b44dcbf-2ef5-4119-ca05-9b082db9c0c1" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titletextbyscoretimestamptype
9<NA>It doesn’t work on Safari, and WebKit based br...archiewood<NA>2023-04-21 16:45:13+00:00comment
420<NA>Well last time I got angry down votes for sayi...drieddust<NA>2021-01-11 19:27:27+00:00comment
815<NA>New iPhone should be announced on September. L...meerita<NA>2019-07-30 20:54:42+00:00comment
1516<NA>Why would this take a week? i(phone)OS was ori...TheOtherHobbes<NA>2021-06-08 09:25:24+00:00comment
1563<NA>&gt;or because Apple drama brings many clicks?...weberer<NA>2022-09-05 13:16:02+00:00comment
\n", - "

5 rows × 6 columns

\n", - "
[5 rows x 6 columns in total]" - ], - "text/plain": [ - " title text by \\\n", - "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "420 Well last time I got angry down votes for sayi... drieddust \n", - "815 New iPhone should be announced on September. L... meerita \n", - "1516 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1563 >or because Apple drama brings many clicks?... weberer \n", - "\n", - " score timestamp type \n", - "9 2023-04-21 16:45:13+00:00 comment \n", - "420 2021-01-11 19:27:27+00:00 comment \n", - "815 2019-07-30 20:54:42+00:00 comment \n", - "1516 2021-06-08 09:25:24+00:00 comment \n", - "1563 2022-09-05 13:16:02+00:00 comment \n", - "\n", - "[5 rows x 6 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iphone_comments = hacker_news_with_texts.semantics.filter(\"The {text} is mainly focused on iPhone\", gemini_model)\n", - "iphone_comments" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yl24sJFIiouZ" - }, + "metadata": {}, "source": [ - "The performance of the semantic operators depends on the length of your input as well as your quota. Here are our benchmark estimates for running the previous operation over data of different sizes, assuming your quota is [the default 200 requests per minute](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas):\n", + "Semantic Operators have been deprecated since version 1.42.0. Please use AI Operators instead.\n", "\n", - "* 800 Rows -> ~4m\n", - "* 2550 Rows -> ~13m\n", - "* 8500 Rows -> ~40m\n", - "\n", - "These numbers can give you a general idea of how fast the operators run." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eo4nfISuiouZ" - }, - "source": [ - "Now, use the LLM to summarize the sentiments towards iPhone:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "IlKBrNxUiouZ", - "outputId": "818d01e4-1cdf-42a2-9e02-61c4736a8905" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
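Those benchmark numbers follow directly from the quota math: one request per row against 200 requests per minute. A sketch, not from the original notebook:

```python
import math

rows = 2556        # hacker_news_with_texts, from the cells above
quota_rpm = 200    # default Vertex AI quota cited above
print(f"lower bound: ~{math.ceil(rows / quota_rpm)} minutes")  # ~13 minutes
```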
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titletextbyscoretimestamptypesentiment
9<NA>It doesn’t work on Safari, and WebKit based br...archiewood<NA>2023-04-21 16:45:13+00:00commentFrustrated, but hopeful.
420<NA>Well last time I got angry down votes for sayi...drieddust<NA>2021-01-11 19:27:27+00:00commentFrustrated and angry.
815<NA>New iPhone should be announced on September. L...meerita<NA>2019-07-30 20:54:42+00:00commentExcited anticipation.
1516<NA>Why would this take a week? i(phone)OS was ori...TheOtherHobbes<NA>2021-06-08 09:25:24+00:00commentFrustrated, critical, obvious.
1563<NA>&gt;or because Apple drama brings many clicks?...weberer<NA>2022-09-05 13:16:02+00:00commentNegative, clickbait, Apple.
\n", - "

5 rows × 7 columns

\n", - "
[5 rows x 7 columns in total]" - ], - "text/plain": [ - " title text by \\\n", - "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "420 Well last time I got angry down votes for sayi... drieddust \n", - "815 New iPhone should be announced on September. L... meerita \n", - "1516 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1563 >or because Apple drama brings many clicks?... weberer \n", - "\n", - " score timestamp type \\\n", - "9 2023-04-21 16:45:13+00:00 comment \n", - "420 2021-01-11 19:27:27+00:00 comment \n", - "815 2019-07-30 20:54:42+00:00 comment \n", - "1516 2021-06-08 09:25:24+00:00 comment \n", - "1563 2022-09-05 13:16:02+00:00 comment \n", - "\n", - " sentiment \n", - "9 Frustrated, but hopeful. \n", - " \n", - "420 Frustrated and angry. \n", - " \n", - "815 Excited anticipation. \n", - " \n", - "1516 Frustrated, critical, obvious. \n", - " \n", - "1563 Negative, clickbait, Apple. \n", - " \n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iphone_comments.semantics.map(\"Summarize the sentiment of the {text}. Your answer should have at most 3 words\", output_column=\"sentiment\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y7_16T2xiouZ" - }, - "source": [ - "Here is another example: count the number of rows whose authors have animals in their names." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "CbGwc_uXiouZ", - "outputId": "138acca0-7fb9-495a-e797-0d42495d65e6" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titletextbyscoretimestamptype
0<NA>Well, most people aren&#x27;t alcoholics, so I...slipframe<NA>2021-06-26 02:37:56+00:00comment
1<NA>No, you don&#x27;t really <i>need</i> a smartp...vetinari<NA>2023-04-19 15:56:34+00:00comment
2<NA>It&#x27;s for the late Paul Allen RIP. Should&...lsr_ssri<NA>2018-10-16 01:07:55+00:00comment
3<NA>Yup they are dangerous. Be careful Donald Trump.Sven7<NA>2015-08-10 16:05:54+00:00comment
4<NA>Sure, it&#x27;s totally reasonable. Just point...nicoburns<NA>2020-10-05 11:20:51+00:00comment
5<NA>I wonder how long before special forces start ...autisticcurio<NA>2020-09-01 15:38:50+00:00comment
6The Impending NY Tech Apocalypse: Here's What ...<NA>gaoprea32011-09-27 22:43:27+00:00story
7<NA>Where would you relocate to? I'm assuming that...pavel_lishin<NA>2011-09-16 19:02:01+00:00comment
8Eureca beta is live. A place for your business...<NA>ricardos12012-10-15 13:09:32+00:00story
9<NA>It doesn’t work on Safari, and WebKit based br...archiewood<NA>2023-04-21 16:45:13+00:00comment
10<NA>I guess I don’t see the relevance. Vegans eat ...stevula<NA>2023-01-19 20:05:54+00:00comment
11<NA>I remember watching the American news media go...fareesh<NA>2019-06-17 19:49:17+00:00comment
12<NA>This article is incorrectly using the current ...stale2002<NA>2018-03-18 18:57:21+00:00comment
13<NA>In the firm I made my internship, we have to u...iserlohnmage<NA>2019-10-22 10:41:01+00:00comment
14<NA>The main reason it requires unsafe is for memo...comex<NA>2017-05-05 20:45:37+00:00comment
15Discord vs. IRC Rough Notes<NA>todsacerdoti482024-07-12 18:39:52+00:00story
16<NA>you have to auth again when you use apple pay.empath75<NA>2017-09-12 18:58:20+00:00comment
17<NA>It goes consumer grade, automotive, military, ...moftz<NA>2021-04-13 01:24:03+00:00comment
18<NA>I don&#x27;t have a link handy but the differe...KennyBlanken<NA>2022-05-13 16:08:38+00:00comment
19<NA>&gt; I don&#x27;t think the use case you menti...colanderman<NA>2017-09-28 05:16:06+00:00comment
20<NA>I think you need to watch it again, because yo...vladimirralev<NA>2018-12-07 11:25:52+00:00comment
21Oh dear: new Yahoo anti-spoofing measures brea...<NA>joshreads12014-04-08 13:29:50+00:00story
22How Much Warmer Was Your City in 2016?<NA>smb0612017-02-16 23:26:34+00:00story
23<NA>Except that they clearly never tried to incent...aenis<NA>2022-01-31 17:08:57+00:00comment
24Working Best at Coffee Shops<NA>GiraffeNecktie2492011-04-19 14:25:17+00:00story
\n", - "

25 rows × 6 columns

\n", - "
[3000 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 The Impending NY Tech Apocalypse: Here's What ... \n", - "7 \n", - "8 Eureca beta is live. A place for your business... \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 Discord vs. IRC Rough Notes \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 Oh dear: new Yahoo anti-spoofing measures brea... \n", - "22 How Much Warmer Was Your City in 2016? \n", - "23 \n", - "24 Working Best at Coffee Shops \n", - "\n", - " text by score \\\n", - "0 Well, most people aren't alcoholics, so I... slipframe \n", - "1 No, you don't really need a smartp... vetinari \n", - "2 It's for the late Paul Allen RIP. Should&... lsr_ssri \n", - "3 Yup they are dangerous. Be careful Donald Trump. Sven7 \n", - "4 Sure, it's totally reasonable. Just point... nicoburns \n", - "5 I wonder how long before special forces start ... autisticcurio \n", - "6 gaoprea 3 \n", - "7 Where would you relocate to? I'm assuming that... pavel_lishin \n", - "8 ricardos 1 \n", - "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "10 I guess I don’t see the relevance. Vegans eat ... stevula \n", - "11 I remember watching the American news media go... fareesh \n", - "12 This article is incorrectly using the current ... stale2002 \n", - "13 In the firm I made my internship, we have to u... iserlohnmage \n", - "14 The main reason it requires unsafe is for memo... comex \n", - "15 todsacerdoti 48 \n", - "16 you have to auth again when you use apple pay. empath75 \n", - "17 It goes consumer grade, automotive, military, ... moftz \n", - "18 I don't have a link handy but the differe... KennyBlanken \n", - "19 > I don't think the use case you menti... colanderman \n", - "20 I think you need to watch it again, because yo... vladimirralev \n", - "21 joshreads 1 \n", - "22 smb06 1 \n", - "23 Except that they clearly never tried to incent... 
aenis \n", - "24 GiraffeNecktie 249 \n", - "\n", - " timestamp type \n", - "0 2021-06-26 02:37:56+00:00 comment \n", - "1 2023-04-19 15:56:34+00:00 comment \n", - "2 2018-10-16 01:07:55+00:00 comment \n", - "3 2015-08-10 16:05:54+00:00 comment \n", - "4 2020-10-05 11:20:51+00:00 comment \n", - "5 2020-09-01 15:38:50+00:00 comment \n", - "6 2011-09-27 22:43:27+00:00 story \n", - "7 2011-09-16 19:02:01+00:00 comment \n", - "8 2012-10-15 13:09:32+00:00 story \n", - "9 2023-04-21 16:45:13+00:00 comment \n", - "10 2023-01-19 20:05:54+00:00 comment \n", - "11 2019-06-17 19:49:17+00:00 comment \n", - "12 2018-03-18 18:57:21+00:00 comment \n", - "13 2019-10-22 10:41:01+00:00 comment \n", - "14 2017-05-05 20:45:37+00:00 comment \n", - "15 2024-07-12 18:39:52+00:00 story \n", - "16 2017-09-12 18:58:20+00:00 comment \n", - "17 2021-04-13 01:24:03+00:00 comment \n", - "18 2022-05-13 16:08:38+00:00 comment \n", - "19 2017-09-28 05:16:06+00:00 comment \n", - "20 2018-12-07 11:25:52+00:00 comment \n", - "21 2014-04-08 13:29:50+00:00 story \n", - "22 2017-02-16 23:26:34+00:00 story \n", - "23 2022-01-31 17:08:57+00:00 comment \n", - "24 2011-04-19 14:25:17+00:00 story \n", - "...\n", - "\n", - "[3000 rows x 6 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", - "hacker_news" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "9dzU8SNziouZ", - "outputId": "da8815c1-c411-4afc-d1ca-5e44c75b5b48" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titletextbyscoretimestamptype
24Working Best at Coffee Shops<NA>GiraffeNecktie2492011-04-19 14:25:17+00:00story
98<NA>i resisted switching to chrome for months beca...catshirt<NA>2011-04-06 08:02:24+00:00comment
137FDA reverses marketing ban on Juul e-cigarettes<NA>anigbrowl22024-06-06 16:42:40+00:00story
188<NA>I think it&#x27;s more than hazing. It may be ...bayesianhorse<NA>2015-06-18 16:42:53+00:00comment
209<NA>I like the idea of moving that arrow the way h...rattray<NA>2015-06-08 02:15:30+00:00comment
228<NA>I don&#x27;t understand why a beginner would s...wolco<NA>2019-02-03 14:35:43+00:00comment
290<NA>I leaerned more with one minute of this than a...agumonkey<NA>2016-07-16 06:19:39+00:00comment
303<NA>I've suggested a <i>rationale</i> for the tabo...mechanical_fish<NA>2008-12-17 04:42:02+00:00comment
312<NA>Do you have any reference for this?<p>I&#x27;m...banashark<NA>2023-11-13 19:57:00+00:00comment
322<NA>Default search scope is an option in the Finde...kitsunesoba<NA>2017-08-13 17:15:19+00:00comment
391<NA>Orthogonality and biology aren&#x27;t friends.agumonkey<NA>2016-04-24 16:33:41+00:00comment
396<NA>I chose some random physics book that was good...prawn<NA>2011-03-27 22:29:51+00:00comment
424<NA>Seeing this get huge on Twitter. It&#x27;s the...shenanigoat<NA>2016-01-09 03:04:22+00:00comment
428<NA>Looking through the comments there are a numbe...moomin<NA>2024-10-01 14:37:04+00:00comment
429<NA>Legacy media is a tough business. GBTC is payi...arcticbull<NA>2021-04-16 16:30:33+00:00comment
436<NA>Same thing if you sell unsafe food, yet we hav...jabradoodle<NA>2023-08-03 20:47:52+00:00comment
438<NA>There was briefly a thing called HSCSD (&quot;...LeoPanthera<NA>2019-02-11 19:49:29+00:00comment
446<NA>&gt; This article is a bit comical to read and...lapcat<NA>2023-01-02 16:00:49+00:00comment
453<NA>Large positions are most likely sold off in sm...meowkit<NA>2021-01-27 23:22:48+00:00comment
507<NA>A US-based VPN (or really any VPN) is only goi...RandomBacon<NA>2019-04-05 00:58:58+00:00comment
543<NA><a href=\"https:&#x2F;&#x2F;codeberg.org&#x2F;A...ElectronBadger<NA>2023-12-13 08:13:15+00:00comment
565<NA>It’s much harder for people without hands to w...Aeolun<NA>2024-05-03 11:58:13+00:00comment
612<NA>So by using ADMIN_SL0T instead was it just set...minitoar<NA>2021-03-05 16:07:56+00:00comment
660<NA>Outstanding!cafard<NA>2022-06-09 09:51:54+00:00comment
673<NA>On the other hand, something can be said for &...babby<NA>2013-08-12 00:31:02+00:00comment
\n", - "

25 rows × 6 columns

\n", - "
[123 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "24 Working Best at Coffee Shops \n", - "98 \n", - "137 FDA reverses marketing ban on Juul e-cigarettes \n", - "188 \n", - "209 \n", - "228 \n", - "290 \n", - "303 \n", - "312 \n", - "322 \n", - "391 \n", - "396 \n", - "424 \n", - "428 \n", - "429 \n", - "436 \n", - "438 \n", - "446 \n", - "453 \n", - "507 \n", - "543 \n", - "565 \n", - "612 \n", - "660 \n", - "673 \n", - "\n", - " text by \\\n", - "24 GiraffeNecktie \n", - "98 i resisted switching to chrome for months beca... catshirt \n", - "137 anigbrowl \n", - "188 I think it's more than hazing. It may be ... bayesianhorse \n", - "209 I like the idea of moving that arrow the way h... rattray \n", - "228 I don't understand why a beginner would s... wolco \n", - "290 I leaerned more with one minute of this than a... agumonkey \n", - "303 I've suggested a rationale for the tabo... mechanical_fish \n", - "312 Do you have any reference for this?

334 rows × 7 columns

\n", + "

25 rows × 7 columns

\n", "[334 rows x 7 columns in total]" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", - "1 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Torgersen 41.4 \n", - "3 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.6 \n", - "4 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - ".. ... ... ... \n", - "339 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", - "340 Adelie Penguin (Pygoscelis adeliae) Biscoe 36.4 \n", - "341 Chinstrap penguin (Pygoscelis antarctica) Dream 40.9 \n", - "342 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", - "343 Chinstrap penguin (Pygoscelis antarctica) Dream 45.2 \n", + " species island culmen_length_mm \\\n", + "0 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "1 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + "2 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.7 \n", + "3 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 \n", + "4 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 \n", + "5 Adelie Penguin (Pygoscelis adeliae) Torgersen 43.1 \n", + "6 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "7 Adelie Penguin (Pygoscelis adeliae) Dream 36.2 \n", + "8 Chinstrap penguin (Pygoscelis antarctica) Dream 46.0 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 54.3 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.5 \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 42.7 \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.0 \n", + "14 Gentoo penguin (Pygoscelis papua) Biscoe 48.5 \n", + "15 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + "16 Gentoo penguin (Pygoscelis papua) Biscoe 50.8 \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 \n", + "18 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.8 \n", + "19 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", + "20 Gentoo penguin (Pygoscelis papua) Biscoe 42.9 \n", + "21 Gentoo penguin (Pygoscelis papua) Biscoe 50.4 \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe 49.0 \n", + "23 Gentoo penguin (Pygoscelis papua) Biscoe 43.4 \n", + "24 Gentoo penguin (Pygoscelis papua) Biscoe 45.0 \n", + "25 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 15.9 225.0 5400.0 MALE \n", - "1 14.5 215.0 5000.0 FEMALE \n", - "2 18.5 202.0 3875.0 MALE \n", - "3 17.0 188.0 2900.0 FEMALE \n", - "4 14.8 217.0 5200.0 FEMALE \n", - ".. ... ... ... ... 
\n", - "339 17.6 187.0 3425.0 FEMALE \n", - "340 17.1 184.0 2850.0 FEMALE \n", - "341 16.6 187.0 3200.0 FEMALE \n", - "342 21.1 195.0 4400.0 MALE \n", - "343 16.6 191.0 3250.0 FEMALE \n", + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "0 16.4 223.0 5950.0 MALE \n", + "1 14.5 213.0 4400.0 FEMALE \n", + "2 16.0 183.0 3075.0 FEMALE \n", + "3 15.6 221.0 5000.0 MALE \n", + "4 13.2 211.0 4500.0 FEMALE \n", + "5 19.2 197.0 3500.0 MALE \n", + "6 15.8 215.0 5300.0 MALE \n", + "7 17.3 187.0 3300.0 FEMALE \n", + "8 18.9 195.0 4150.0 FEMALE \n", + "9 15.7 231.0 5650.0 MALE \n", + "11 17.4 186.0 3800.0 FEMALE \n", + "12 13.7 208.0 3950.0 FEMALE \n", + "13 20.0 203.0 4725.0 MALE \n", + "14 15.0 219.0 4850.0 FEMALE \n", + "15 18.2 193.0 3775.0 MALE \n", + "16 17.3 228.0 5600.0 MALE \n", + "17 14.1 217.0 4375.0 FEMALE \n", + "18 17.2 180.0 3800.0 MALE \n", + "19 18.8 203.0 4100.0 MALE \n", + "20 13.1 215.0 5000.0 FEMALE \n", + "21 15.3 224.0 5550.0 MALE \n", + "22 16.1 216.0 5550.0 MALE \n", + "23 14.4 218.0 4600.0 FEMALE \n", + "24 15.4 220.0 5050.0 MALE \n", + "25 14.0 212.0 4875.0 FEMALE \n", "...\n", "\n", "[334 rows x 7 columns]" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -253,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -277,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -286,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -297,37 +469,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 582e7c02-bcc6-412a-a513-46ee5dba7ad8 is DONE. 2.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 917ff09b-072b-4c55-b26f-1780e2e97519 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 2f4e102d-48bc-401f-a781-39830e2c6c9b is DONE. 16.4 kB processed. Open Job" + "Query job 9ce9fb43-306d-46e9-bbe5-d98ee55143bd is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -339,7 +487,7 @@ { "data": { "text/html": [ - "Query job aabe8a28-8dce-4e00-8a8c-18e9e090e6e7 is DONE. 26.3 kB processed. Open Job" + "Query job 8c86156d-ee97-4f66-9dc1-db15ff3d8e8e is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -351,19 +499,7 @@ { "data": { "text/html": [ - "Query job ec9d8798-e28e-44bc-aa8e-44ab28f0214f is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8aa0fa94-e43e-41c6-9de3-f0a67392c47f is DONE. 48 Bytes processed. Open Job" + "Query job b8f2b382-b938-4dff-8bdb-129703ade285 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -377,10 +513,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 318.358226 151689.571141 0.009814 \n", + "0 297.36838 148892.914876 0.009057 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 255.095561 0.780659 0.783304 \n", + "0 238.424052 0.814613 0.816053 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -388,7 +524,7 @@ { "data": { "text/html": [ - "Query job bf6ef937-9583-4aa8-8313-563638465d5f is DONE. 25.9 kB processed. Open Job" + "Query job ec2968f3-1713-4617-8a26-6fe4267f8061 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -400,7 +536,7 @@ { "data": { "text/html": [ - "Query job 4c8b564c-5bbd-4447-babf-e307524962e5 is DONE. 16.4 kB processed. Open Job" + "Query job c7a1b80f-26f5-41b1-bcdc-b276af141671 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -412,31 +548,7 @@ { "data": { "text/html": [ - "Query job cd5e337f-6d44-473d-a90b-be8a79bba6bf is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ad80012d-7c6c-4dbf-9271-2ff7f899f174 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8fc20587-d8ba-4c0f-bed9-3e1cf3c6ae52 is DONE. 48 Bytes processed. Open Job" + "Query job 82054991-c22f-41b3-9802-f16919949e26 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -450,10 +562,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 306.435423 151573.84019 0.008539 \n", + "0 307.6149 139013.303482 0.007907 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 244.2899 0.737623 0.742859 \n", + "0 266.589811 0.782835 0.794297 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -461,7 +573,7 @@ { "data": { "text/html": [ - "Query job 90286d2b-e805-4b19-8876-c9973579e9ff is DONE. 25.9 kB processed. Open Job" + "Query job 3e5ae019-7c5b-44ea-8392-85145fdb6802 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -473,7 +585,7 @@ { "data": { "text/html": [ - "Query job ceb6c8f2-16cc-4758-bde8-3e4975ba1452 is DONE. 16.4 kB processed. Open Job" + "Query job c35dfd28-504d-4d12-b039-da890b9cb51d is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -485,31 +597,7 @@ { "data": { "text/html": [ - "Query job f49434fa-a7e0-406a-bbe2-5651595e3418 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5dd7a277-10fe-4117-a354-ef8668a8b913 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4b58b016-9a50-4a66-b86c-8431faad43bf is DONE. 48 Bytes processed. Open Job" + "Query job 29ac1bb3-f864-400e-8cac-0b4c7f78ebcd is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -523,10 +611,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 253.349578 112039.741164 0.007153 \n", + "0 348.412701 180661.063512 0.01125 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 185.916761 0.823381 0.823456 \n", + "0 313.29406 0.744053 0.74537 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -534,7 +622,7 @@ { "data": { "text/html": [ - "Query job ca700ecf-0c08-4286-b979-2bc7a0bee89c is DONE. 25.9 kB processed. Open Job" + "Query job d90f5938-2894-4c93-8691-21162a2fca4c is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -546,7 +634,7 @@ { "data": { "text/html": [ - "Query job f0731e71-7754-47a2-a553-93a61e712533 is DONE. 16.4 kB processed. Open Job" + "Query job 4c6328b3-2d3f-42bb-9f83-4f8c84773c95 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -558,31 +646,7 @@ { "data": { "text/html": [ - "Query job ae66d34d-5f0a-4297-9d41-57067ae54a9b is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7655a649-ceca-4792-b764-fb371f5872ec is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8b0634c8-73a9-422c-9644-842142dbb059 is DONE. 48 Bytes processed. Open Job" + "Query job 8a885a6a-d3ad-4569-80ce-4f57d9b86105 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -596,10 +660,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 320.381386 155234.800349 0.008638 \n", + "0 309.991882 151820.705254 0.008898 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 306.281263 0.793405 0.794504 \n", + "0 212.758708 0.694001 0.694287 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -607,19 +671,7 @@ { "data": { "text/html": [ - "Query job bb26cde9-1991-4e0a-8492-b19d15b1b7aa is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7ddd0883-492d-46bc-a588-f3cbab2474bb is DONE. 16.5 kB processed. Open Job" + "Query job d1e60370-11c8-4f49-a8d5-85417662aa51 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -631,7 +683,7 @@ { "data": { "text/html": [ - "Query job 5de571e4-d2f9-43c7-b014-3d65a3731b64 is DONE. 26.3 kB processed. Open Job" + "Query job d8e8712a-6347-4725-a27d-49810d4acc1c is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -643,19 +695,7 @@ { "data": { "text/html": [ - "Query job d20ac7d8-cd21-4a1f-a200-2dfa6373bcdb is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 235e8a80-33ea-4a95-a7d0-34e40a8ca396 is DONE. 48 Bytes processed. Open Job" + "Query job 6a0ebaa6-5572-404f-a41d-b90e2c65d948 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -669,10 +709,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 303.855563 141869.030392 0.008989 \n", + "0 256.569216 103495.042886 0.006605 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 245.102301 0.731737 0.732793 \n", + "0 222.940815 0.818589 0.832344 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -696,145 +736,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 9274ae2e-e9a7-4701-ac64-56632323d02a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 22f9477b-de02-4c07-b480-c3270a69d7e0 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ebb192b7-4a9e-4238-b4e6-b630e2f94988 is DONE. 16.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 44441e8c-8753-41b0-b1b7-9a6c4eab8c74 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 239fed9a-b488-47da-a0df-a3b7c6ec40f4 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f4248b2d-3430-426c-872d-8590f2878366 is DONE. 16.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d9f6b034-c300-4dd7-91dd-48fa912f2456 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e2f39f5b-2f4c-402a-a8d5-a7cff918508d is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 54cf3710-b5f4-4aec-b11f-0281126a151a is DONE. 16.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 833d13cd-ec59-499b-98f6-95ec18766698 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0120e332-0691-44a4-9198-f5c131b8f59c is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f4ba7a4c-5fd9-4f97-ab34-a8f139e7472a is DONE. 16.4 kB processed. Open Job" + "Query job 5bdcd65d-7d72-4094-be3a-cf67a1787cf4 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -846,7 +754,7 @@ { "data": { "text/html": [ - "Query job 857aadfc-2ade-429c-bef8-428e44d48c55 is DONE. 26.3 kB processed. Open Job" + "Query job bb0504b2-b656-4a08-9bf8-dcab0d188022 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -858,7 +766,7 @@ { "data": { "text/html": [ - "Query job 906d6d34-a506-4957-b07f-7e5ed2e0634b is DONE. 25.9 kB processed. 
Open Job" + "Query job 8c5c4b66-9a14-455a-a3f5-99f0f522713f is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -870,7 +778,7 @@ { "data": { "text/html": [ - "Query job 498563db-3e68-4df7-a2d5-83da6adb49ed is DONE. 16.5 kB processed. Open Job" + "Query job 9c9b81de-35b6-4561-8881-57da8b73cc7f is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -882,7 +790,7 @@ { "data": { "text/html": [ - "Query job 01af95ca-6288-4253-b379-7327e1c9de88 is DONE. 26.3 kB processed. Open Job" + "Query job b781f1aa-6572-49e5-ab8d-f1908b497a1c is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -894,7 +802,7 @@ { "data": { "text/html": [ - "Query job 5ce36d32-6db1-42e5-a8cf-84bb8244a57e is DONE. 48 Bytes processed. Open Job" + "Query job 41a2a58e-0289-4d58-8e39-de286f2a91fb is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -906,7 +814,7 @@ { "data": { "text/html": [ - "Query job e05ec77d-6025-4edd-b5e3-9c4e7a124e71 is DONE. 48 Bytes processed. Open Job" + "Query job 7ee839a9-f77c-49b0-844e-8eecc1647b97 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -918,7 +826,7 @@ { "data": { "text/html": [ - "Query job 418a4a5d-2bb3-41e5-9e7c-9852389a491b is DONE. 48 Bytes processed. Open Job" + "Query job a317d488-8589-4faa-940b-e59af91caf4d is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -930,7 +838,7 @@ { "data": { "text/html": [ - "Query job b33e30da-cfed-4d6f-b227-f433d97879cb is DONE. 48 Bytes processed. Open Job" + "Query job 2de96ea8-519a-4976-a641-eb26a4bd38fb is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -942,7 +850,7 @@ { "data": { "text/html": [ - "Query job 7ad7f0c8-ecae-4ef2-bc91-0ebeb5f88e7b is DONE. 48 Bytes processed. Open Job" + "Query job 41a7d5a0-c76b-4ef3-a3da-d4d5a2ebbb0e is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -954,7 +862,7 @@ { "data": { "text/html": [ - "Query job a6e8bd12-1122-4c26-b0e1-58342238016c is DONE. 48 Bytes processed. Open Job" + "Query job 9e82ddc9-8461-4644-ba34-957a7426ff8e is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -966,7 +874,7 @@ { "data": { "text/html": [ - "Query job c553439c-9586-479c-92c5-01a0d333125b is DONE. 48 Bytes processed. Open Job" + "Query job 0fa84d07-fdfa-41c9-b601-9326a94f3a09 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -978,7 +886,7 @@ { "data": { "text/html": [ - "Query job c598d64c-26b9-49fc-afad-a6544b38cfa2 is DONE. 48 Bytes processed. Open Job" + "Query job d4495568-f1b5-431b-b892-4fc7dcbccfd5 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -990,7 +898,7 @@ { "data": { "text/html": [ - "Query job ebcb73e8-1294-4f10-b826-c495046fd714 is DONE. 48 Bytes processed. Open Job" + "Query job af1e6460-3078-4a8b-8992-9e7df9dcfbb3 is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -1002,7 +910,7 @@ { "data": { "text/html": [ - "Query job d73f57ba-a25d-4b90-b474-13d81a3e22ab is DONE. 48 Bytes processed. Open Job" + "Query job f14401bf-fd80-401a-a61d-52614fba1ca7 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -1015,53 +923,53 @@ "data": { "text/plain": [ "{'test_score': [ mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 237.154735 97636.17064 0.005571 \n", + " 0 322.341485 157616.627179 0.009137 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 187.883888 0.842018 0.846816 \n", + " 0 269.412639 0.705594 0.724882 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 304.281635 141966.045867 0.008064 \n", + " 0 289.682121 136550.318797 0.00878 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 236.096453 0.762979 0.764008 \n", + " 0 212.874686 0.799363 0.81416 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 316.380322 157332.146085 0.009699 \n", + " 0 325.358522 155218.752974 0.009606 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 222.824496 0.764607 0.765369 \n", + " 0 267.301671 0.777174 0.7782 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 309.609657 152421.826588 0.009772 \n", + " 0 286.874056 120586.575364 0.007484 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 254.163976 0.772954 0.773119 \n", + " 0 247.656578 0.79281 0.796001 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 339.339345 169760.629993 0.010597 \n", + " 0 287.989397 145947.465344 0.008447 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 312.335706 0.741167 0.74118 \n", + " 0 186.777549 0.791452 0.798825 \n", " \n", " [1 rows x 6 columns]],\n", - " 'fit_time': [18.200648623984307,\n", - " 17.565149880945683,\n", - " 18.202434757025912,\n", - " 18.04062689607963,\n", - " 19.370970834977925],\n", - " 'score_time': [4.76077218609862,\n", - " 4.577479084953666,\n", - " 4.581933492794633,\n", - " 4.741644307971001,\n", - " 5.1031754210125655]}" + " 'fit_time': [18.79181448201416,\n", + " 19.092008439009078,\n", + " 75.7446747609647,\n", + " 17.520530884969048,\n", + " 21.157033596013207],\n", + " 'score_time': [4.247669544012751,\n", + " 6.792615927988663,\n", + " 4.502274781989399,\n", + " 4.484583999030292,\n", + " 4.224339194013737]}" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1097,7 +1005,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/ml/easy_linear_regression.ipynb b/notebooks/ml/easy_linear_regression.ipynb index fdabd82a4b..5a7258a182 100644 --- a/notebooks/ml/easy_linear_regression.ipynb +++ b/notebooks/ml/easy_linear_regression.ipynb @@ -52,20 +52,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(DatasetReference('shobs-test', 'bqml_tutorial'))" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset = f\"{session.bqclient.project}.bqml_tutorial\"\n", "session.bqclient.create_dataset(dataset, exists_ok=True)" @@ -96,383 +85,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 
525fc879-1f59-45e8-96b4-f9c67d244d06 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 91aa1b30-2b0e-41eb-9bfb-4f6232913b31 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Adelie Penguin (Pygoscelis adeliae)Biscoe40.118.9188.04300.0MALE
1Adelie Penguin (Pygoscelis adeliae)Torgersen39.118.7181.03750.0MALE
2Gentoo penguin (Pygoscelis papua)Biscoe47.414.6212.04725.0FEMALE
3Chinstrap penguin (Pygoscelis antarctica)Dream42.516.7187.03350.0FEMALE
4Adelie Penguin (Pygoscelis adeliae)Biscoe43.219.0197.04775.0MALE
5Gentoo penguin (Pygoscelis papua)Biscoe46.715.3219.05200.0MALE
6Adelie Penguin (Pygoscelis adeliae)Biscoe41.321.1195.04400.0MALE
7Gentoo penguin (Pygoscelis papua)Biscoe45.213.8215.04750.0FEMALE
8Gentoo penguin (Pygoscelis papua)Biscoe46.513.5210.04550.0FEMALE
9Gentoo penguin (Pygoscelis papua)Biscoe50.515.2216.05000.0FEMALE
10Gentoo penguin (Pygoscelis papua)Biscoe48.215.6221.05100.0MALE
11Adelie Penguin (Pygoscelis adeliae)Dream38.118.6190.03700.0FEMALE
12Gentoo penguin (Pygoscelis papua)Biscoe50.715.0223.05550.0MALE
13Adelie Penguin (Pygoscelis adeliae)Biscoe37.820.0190.04250.0MALE
14Adelie Penguin (Pygoscelis adeliae)Biscoe35.017.9190.03450.0FEMALE
15Gentoo penguin (Pygoscelis papua)Biscoe48.715.7208.05350.0MALE
16Adelie Penguin (Pygoscelis adeliae)Torgersen34.621.1198.04400.0MALE
17Gentoo penguin (Pygoscelis papua)Biscoe46.815.4215.05150.0MALE
18Chinstrap penguin (Pygoscelis antarctica)Dream50.320.0197.03300.0MALE
19Adelie Penguin (Pygoscelis adeliae)Dream37.218.1178.03900.0MALE
20Chinstrap penguin (Pygoscelis antarctica)Dream51.018.8203.04100.0MALE
21Adelie Penguin (Pygoscelis adeliae)Biscoe40.517.9187.03200.0FEMALE
22Gentoo penguin (Pygoscelis papua)Biscoe45.513.9210.04200.0FEMALE
23Adelie Penguin (Pygoscelis adeliae)Dream42.218.5180.03550.0FEMALE
24Chinstrap penguin (Pygoscelis antarctica)Dream51.720.3194.03775.0MALE
\n", - "

25 rows × 7 columns

\n", - "
[344 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.1 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.1 \n", - "2 Gentoo penguin (Pygoscelis papua) Biscoe 47.4 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", - "5 Gentoo penguin (Pygoscelis papua) Biscoe 46.7 \n", - "6 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", - "7 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", - "8 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "9 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", - "10 Gentoo penguin (Pygoscelis papua) Biscoe 48.2 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", - "12 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", - "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.0 \n", - "15 Gentoo penguin (Pygoscelis papua) Biscoe 48.7 \n", - "16 Adelie Penguin (Pygoscelis adeliae) Torgersen 34.6 \n", - "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", - "18 Chinstrap penguin (Pygoscelis antarctica) Dream 50.3 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", - "21 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", - "22 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 42.2 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 51.7 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.9 188.0 4300.0 MALE \n", - "1 18.7 181.0 3750.0 MALE \n", - "2 14.6 212.0 4725.0 FEMALE \n", - "3 16.7 187.0 3350.0 FEMALE \n", - "4 19.0 197.0 4775.0 MALE \n", - "5 15.3 219.0 5200.0 MALE \n", - "6 21.1 195.0 4400.0 MALE \n", - "7 13.8 215.0 4750.0 FEMALE \n", - "8 13.5 210.0 4550.0 FEMALE \n", - "9 15.2 216.0 5000.0 FEMALE \n", - "10 15.6 221.0 5100.0 MALE \n", - "11 18.6 190.0 3700.0 FEMALE \n", - "12 15.0 223.0 5550.0 MALE \n", - "13 20.0 190.0 4250.0 MALE \n", - "14 17.9 190.0 3450.0 FEMALE \n", - "15 15.7 208.0 5350.0 MALE \n", - "16 21.1 198.0 4400.0 MALE \n", - "17 15.4 215.0 5150.0 MALE \n", - "18 20.0 197.0 3300.0 MALE \n", - "19 18.1 178.0 3900.0 MALE \n", - "20 18.8 203.0 4100.0 MALE \n", - "21 17.9 187.0 3200.0 FEMALE \n", - "22 13.9 210.0 4200.0 FEMALE \n", - "23 18.5 180.0 3550.0 FEMALE \n", - "24 20.3 194.0 3775.0 MALE \n", - "...\n", - "\n", - "[344 rows x 7 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# read a BigQuery table to a BigQuery DataFrame\n", "df = bigframes.pandas.read_gbq(f\"bigquery-public-data.ml_datasets.penguins\")\n", @@ -491,357 +106,9 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d2bd7c5e-2652-4c0d-8495-8ef65e89031b is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 92f0a5e5-bc61-426f-a9ef-213a1c376851 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
islandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Biscoe40.118.9188.04300.0MALE
1Torgersen39.118.7181.03750.0MALE
4Biscoe43.219.0197.04775.0MALE
6Biscoe41.321.1195.04400.0MALE
11Dream38.118.6190.03700.0FEMALE
13Biscoe37.820.0190.04250.0MALE
14Biscoe35.017.9190.03450.0FEMALE
16Torgersen34.621.1198.04400.0MALE
19Dream37.218.1178.03900.0MALE
21Biscoe40.517.9187.03200.0FEMALE
23Dream42.218.5180.03550.0FEMALE
30Dream39.221.1196.04150.0MALE
32Torgersen42.917.6196.04700.0MALE
38Dream41.117.5190.03900.0MALE
40Torgersen38.621.2191.03800.0MALE
42Biscoe35.516.2195.03350.0FEMALE
44Dream39.218.6190.04250.0MALE
45Torgersen35.215.9186.03050.0FEMALE
46Dream43.218.5192.04100.0MALE
49Biscoe39.617.7186.03500.0FEMALE
53Biscoe45.620.3191.04600.0MALE
58Torgersen40.916.8191.03700.0FEMALE
60Torgersen40.318.0195.03250.0FEMALE
62Dream36.018.5186.03100.0FEMALE
63Torgersen39.320.6190.03650.0MALE
\n", - "

25 rows × 6 columns

\n", - "
[146 rows x 6 columns in total]" - ], - "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "0 Biscoe 40.1 18.9 188.0 \n", - "1 Torgersen 39.1 18.7 181.0 \n", - "4 Biscoe 43.2 19.0 197.0 \n", - "6 Biscoe 41.3 21.1 195.0 \n", - "11 Dream 38.1 18.6 190.0 \n", - "13 Biscoe 37.8 20.0 190.0 \n", - "14 Biscoe 35.0 17.9 190.0 \n", - "16 Torgersen 34.6 21.1 198.0 \n", - "19 Dream 37.2 18.1 178.0 \n", - "21 Biscoe 40.5 17.9 187.0 \n", - "23 Dream 42.2 18.5 180.0 \n", - "30 Dream 39.2 21.1 196.0 \n", - "32 Torgersen 42.9 17.6 196.0 \n", - "38 Dream 41.1 17.5 190.0 \n", - "40 Torgersen 38.6 21.2 191.0 \n", - "42 Biscoe 35.5 16.2 195.0 \n", - "44 Dream 39.2 18.6 190.0 \n", - "45 Torgersen 35.2 15.9 186.0 \n", - "46 Dream 43.2 18.5 192.0 \n", - "49 Biscoe 39.6 17.7 186.0 \n", - "53 Biscoe 45.6 20.3 191.0 \n", - "58 Torgersen 40.9 16.8 191.0 \n", - "60 Torgersen 40.3 18.0 195.0 \n", - "62 Dream 36.0 18.5 186.0 \n", - "63 Torgersen 39.3 20.6 190.0 \n", - "\n", - " body_mass_g sex \n", - "0 4300.0 MALE \n", - "1 3750.0 MALE \n", - "4 4775.0 MALE \n", - "6 4400.0 MALE \n", - "11 3700.0 FEMALE \n", - "13 4250.0 MALE \n", - "14 3450.0 FEMALE \n", - "16 4400.0 MALE \n", - "19 3900.0 MALE \n", - "21 3200.0 FEMALE \n", - "23 3550.0 FEMALE \n", - "30 4150.0 MALE \n", - "32 4700.0 MALE \n", - "38 3900.0 MALE \n", - "40 3800.0 MALE \n", - "42 3350.0 FEMALE \n", - "44 4250.0 MALE \n", - "45 3050.0 FEMALE \n", - "46 4100.0 MALE \n", - "49 3500.0 FEMALE \n", - "53 4600.0 MALE \n", - "58 3700.0 FEMALE \n", - "60 3250.0 FEMALE \n", - "62 3100.0 FEMALE \n", - "63 3650.0 MALE \n", - "...\n", - "\n", - "[146 rows x 6 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# filter down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", @@ -880,56 +147,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 43c8fdc2-0bc3-4607-a36d-5bee87c894d8 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 97e0c84d-aa6a-4197-9377-740d973ea44d is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 726b9a5e-48a1-4ced-ac34-fa028dcb2bf4 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from bigframes.ml.linear_model import LinearRegression\n", "\n", @@ -942,104 +162,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 28975567-2526-40f7-a7be-9dee6f782b4e is DONE. 9.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5c71d3d9-0e1c-45bd-866f-1f98f056260d is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 890767f7-a83b-469a-9f3e-abd5667f8202 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", - "

1 rows × 6 columns

\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 223.878763 78553.601634 0.005614 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 181.330911 0.623951 0.623951 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# check how the model performed\n", "model.score(feature_columns, label_columns)" @@ -1047,103 +172,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d59df3e8-cf87-4340-a4c7-a27c3abfcc50 is DONE. 29.1 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5af493aa-96f9-434f-a101-ec855f4de694 is DONE. 8 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e2076bc3-3966-4c45-8265-c461756a7782 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e9cdfca7-30f6-4e93-95fb-244896e7c2ab is DONE. 16 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_g
3345891.735118
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "334 5891.735118\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# use the model to predict the missing labels\n", "model.predict(missing_body_mass)" @@ -1159,32 +190,9 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Copy job cb4ef454-10df-4325-b9cb-6084df3ac9d5 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "LinearRegression(optimize_strategy='NORMAL_EQUATION')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# save the model to a permanent location in BigQuery, so we can use it in future sessions (and elsewhere in BQ)\n", "model.to_gbq(penguins_model, replace=True)" @@ -1199,20 +207,9 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression(optimize_strategy='NORMAL_EQUATION')" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# WARNING - until b/281709360 is fixed & pipeline is updated, pipelines will load as models,\n", "# and details of their transform steps will be lost (the loaded model will behave the same)\n", diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 2114311e10..e2bc88ecae 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -174,7 +174,7 @@ "source": [ "# User defined function\n", "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n", - "def nth_prime(n):\n", + "def nth_prime(n: int) -> int:\n", " prime_numbers = [2,3]\n", " i=3\n", " if(0 int:\n", " prime_numbers = [2,3]\n", " i=3\n", " if(0 str:\n", " if duration_minutes < 90:\n", " return \"short\"\n", @@ -466,7 +466,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " if duration_minutes < 90:\n", " return DURATION_CATEGORY_SHORT\n", @@ -675,7 +675,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " duration_hours = mymath.ceil(duration_minutes / 60)\n", " return f\"{duration_hours}h\"\n", @@ -886,7 +886,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " duration_hours = get_hour_ceiling(duration_minutes)\n", " return f\"{duration_hours} hrs\"\n", @@ -1068,7 +1068,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False, packages=[\"cryptography\"])\n", + "@bpd.remote_function(reuse=False, packages=[\"cryptography\"], cloud_function_service_account=\"default\")\n", "def get_hash(input: str) -> str:\n", " from cryptography.fernet import Fernet\n", "\n", @@ -1271,7 +1271,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False, packages=[\"humanize\"])\n", + 
"@bpd.remote_function(reuse=False, packages=[\"humanize\"], cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " timedelta = dt.timedelta(minutes=duration_minutes)\n", " return humanize.naturaldelta(timedelta)\n", @@ -1442,7 +1442,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 78f0d27474..605f879bc7 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -286,7 +286,9 @@ "source": [ "@bpd.remote_function(packages=[\"anthropic[vertex]\", \"google-auth[requests]\"],\n", " max_batching_rows=1, \n", - " bigquery_connection=\"bigframes-dev.us-east5.bigframes-rf-conn\") # replace with your connection\n", + " bigquery_connection=\"bigframes-dev.us-east5.bigframes-rf-conn\", # replace with your connection\n", + " cloud_function_service_account=\"default\",\n", + ")\n", "def anthropic_transformer(message: str) -> str:\n", " from anthropic import AnthropicVertex\n", " client = AnthropicVertex(region=LOCATION, project_id=PROJECT)\n", diff --git a/noxfile.py b/noxfile.py index bcab34d0c0..bf9a435b0f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -29,7 +29,9 @@ import nox.sessions BLACK_VERSION = "black==22.3.0" +FLAKE8_VERSION = "flake8==7.1.2" ISORT_VERSION = "isort==5.12.0" +MYPY_VERSION = "mypy==1.15.0" # TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751) LATEST_FULLY_SUPPORTED_PYTHON = "3.12" @@ -67,7 +69,6 @@ UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", "asyncmock", - "freezegun", PYTEST_VERSION, "pytest-cov", "pytest-asyncio", @@ -135,7 +136,7 @@ def lint(session): Returns a failure if the linters find linting errors or sufficiently serious code quality issues. """ - session.install("flake8", BLACK_VERSION, ISORT_VERSION) + session.install(FLAKE8_VERSION, BLACK_VERSION, ISORT_VERSION) session.run( "isort", "--check", @@ -184,6 +185,14 @@ def lint_setup_py(session): session.install("docutils", "pygments") session.run("python", "setup.py", "check", "--restructuredtext", "--strict") + session.install("twine", "wheel") + shutil.rmtree("build", ignore_errors=True) + shutil.rmtree("dist", ignore_errors=True) + session.run("python", "setup.py", "sdist") + session.run( + "python", "-m", "twine", "check", *pathlib.Path("dist").glob("*.tar.gz") + ) + def install_unittest_dependencies(session, install_test_extra, *constraints): standard_deps = UNIT_TEST_STANDARD_DEPENDENCIES + UNIT_TEST_DEPENDENCIES @@ -256,7 +265,7 @@ def mypy(session): deps = ( set( [ - "mypy", + MYPY_VERSION, # TODO: update to latest pandas-stubs once we resolve bigframes issues. "pandas-stubs<=2.2.3.241126", "types-protobuf", @@ -773,7 +782,8 @@ def notebook(session: nox.Session): "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # Needs BUCKET_URI. # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. 
- "notebooks/experimental/longer_ml_demo.ipynb", + "notebooks/experimental/ai_operators.ipynb", + "notebooks/experimental/multimodal_dataframe.ipynb", "notebooks/experimental/semantic_operators.ipynb", # The notebooks that are added for more use cases, such as backing a # blog post, which may take longer to execute and need not be diff --git a/owlbot.py b/owlbot.py index 159df04abd..fa5491ee20 100644 --- a/owlbot.py +++ b/owlbot.py @@ -42,6 +42,8 @@ s.move( templated_files, excludes=[ + # Need a combined LICENSE for all vendored packages. + "LICENSE", # Multi-processing note isn't relevant, as bigframes is responsible for # creating clients, not the end user. "docs/multiprocessing.rst", diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 9171ac78a4..5cba045ce4 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -76,6 +76,11 @@ def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) +@pytest.fixture(scope="session") +def gcs_dst_bucket() -> str: + return "gs://bigframes_blob_test" + + @pytest.fixture def random_model_id( bigquery_client: bigquery.Client, project_id: str, dataset_id: str diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py index b749c37d50..0ce38e1a85 100644 --- a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py @@ -73,26 +73,103 @@ def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None: from bigframes.ml import forecasting import bigframes.pandas as bpd + model = forecasting.ARIMAPlus( + # To reduce the query runtime with the compromise of a potential slight + # drop in model quality, you could decrease the value of the + # auto_arima_max_order. This shrinks the search space of hyperparameter + # tuning in the auto.ARIMA algorithm. + auto_arima_max_order=5, + ) + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + # This query creates twelve time series models, one for each of the twelve + # Citi Bike start stations in the input data. If you remove this row + # filter, there would be 600+ time series to forecast. + df = df[df["start_station_name"].str.contains("Central Park")] + features = bpd.DataFrame( { - "num_trips": df.starttime, + "start_station_name": df["start_station_name"], + "num_trips": df["starttime"], "date": df["starttime"].dt.date, } ) - num_trips = features.groupby(["date"], as_index=False).count() - model = forecasting.ARIMAPlus() + num_trips = features.groupby( + ["start_station_name", "date"], + as_index=False, + ).count() X = num_trips["date"].to_frame() y = num_trips["num_trips"].to_frame() - model.fit(X, y) + model.fit( + X, + y, + # The input data that you want to get forecasts for, + # in this case the Citi Bike station, as represented by the + # start_station_name column. + id_col=num_trips["start_station_name"].to_frame(), + ) + # The model.fit() call above created a temporary model. # Use the to_gbq() method to write to a permanent location. - model.to_gbq( your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", replace=True, ) # [END bigquery_dataframes_bqml_arima_multiple_step_3_fit] + + # [START bigquery_dataframes_bqml_arima_multiple_step_4_evaluate] + # Evaluate the time series models by using the summary() function. 
The summary() + # function shows you the evaluation metrics of all the candidate models evaluated + # during the process of automatic hyperparameter tuning. + summary = model.summary() + print(summary.peek()) + + # Expected output: + # start_station_name non_seasonal_p non_seasonal_d non_seasonal_q has_drift log_likelihood AIC variance ... + # 1 Central Park West & W 72 St 0 1 5 False -1966.449243 3944.898487 1215.689281 ... + # 8 Central Park W & W 96 St 0 0 5 False -274.459923 562.919847 655.776577 ... + # 9 Central Park West & W 102 St 0 0 0 False -226.639918 457.279835 258.83582 ... + # 11 Central Park West & W 76 St 1 1 2 False -1700.456924 3408.913848 383.254161 ... + # 4 Grand Army Plaza & Central Park S 0 1 5 False -5507.553498 11027.106996 624.138741 ... + # [END bigquery_dataframes_bqml_arima_multiple_step_4_evaluate] + + # [START bigquery_dataframes_bqml_arima_multiple_step_5_coefficients] + coef = model.coef_ + print(coef.peek()) + + # Expected output: + # start_station_name ar_coefficients ma_coefficients intercept_or_drift + # 5 Central Park West & W 68 St [] [-0.41014089 0.21979212 -0.59854213 -0.251438... 0.0 + # 6 Central Park S & 6 Ave [] [-0.71488957 -0.36835772 0.61008532 0.183290... 0.0 + # 0 Central Park West & W 85 St [] [-0.39270166 -0.74494638 0.76432596 0.489146... 0.0 + # 3 W 82 St & Central Park West [-0.50219511 -0.64820817] [-0.20665325 0.67683137 -0.68108631] 0.0 + # 11 W 106 St & Central Park West [-0.70442887 -0.66885553 -0.25030325 -0.34160669] [] 0.0 + # [END bigquery_dataframes_bqml_arima_multiple_step_5_coefficients] + + # [START bigquery_dataframes_bqml_arima_multiple_step_6_forecast] + prediction = model.predict(horizon=3, confidence_level=0.9) + + print(prediction.peek()) + # Expected output: + # forecast_timestamp start_station_name forecast_value standard_error confidence_level ... + # 4 2016-10-01 00:00:00+00:00 Central Park S & 6 Ave 302.377201 32.572948 0.9 ... + # 14 2016-10-02 00:00:00+00:00 Central Park North & Adam Clayton Powell Blvd 263.917567 45.284082 0.9 ... + # 1 2016-09-25 00:00:00+00:00 Central Park West & W 85 St 189.574706 39.874856 0.9 ... + # 20 2016-10-02 00:00:00+00:00 Central Park West & W 72 St 175.474862 40.940794 0.9 ... + # 12 2016-10-01 00:00:00+00:00 W 106 St & Central Park West 63.88163 18.088868 0.9 ... 
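+    # With confidence_level=0.9, each forecast row also carries a prediction
+    # interval of roughly forecast_value +/- 1.645 * standard_error (a normal
+    # approximation): for the first row above, about 302.4 +/- 53.6.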
+ # [END bigquery_dataframes_bqml_arima_multiple_step_6_forecast] + # [START bigquery_dataframes_bqml_arima_multiple_step_7_explain] + explain = model.predict_explain(horizon=3, confidence_level=0.9) + + print(explain.peek(5)) + # Expected output: + # time_series_timestamp start_station_name time_series_type time_series_data time_series_adjusted_data standard_error confidence_level prediction_interval_lower_bound prediction_interval_upper_bound trend seasonal_period_yearly seasonal_period_quarterly seasonal_period_monthly seasonal_period_weekly seasonal_period_daily holiday_effect spikes_and_dips step_changes residual + # 0 2013-07-01 00:00:00+00:00 Central Park S & 6 Ave history 69.0 154.168527 32.572948 0.0 35.477484 -28.402102 0.0 -85.168527 147.093145 + # 1 2013-07-01 00:00:00+00:00 Grand Army Plaza & Central Park S history 79.0 79.0 24.982769 0.0 43.46428 -30.01599 0.0 0.0 65.55171 + # 2 2013-07-02 00:00:00+00:00 Central Park S & 6 Ave history 180.0 204.045651 32.572948 147.093045 72.498327 -15.545721 0.0 -85.168527 61.122876 + # 3 2013-07-02 00:00:00+00:00 Grand Army Plaza & Central Park S history 129.0 99.556269 24.982769 65.551665 45.836432 -11.831828 0.0 0.0 29.443731 + # 4 2013-07-03 00:00:00+00:00 Central Park S & 6 Ave history 115.0 205.968236 32.572948 191.32754 59.220766 -44.580071 0.0 -85.168527 -5.799709 + # [END bigquery_dataframes_bqml_arima_multiple_step_7_explain] diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py deleted file mode 100644 index 5cdcd6d3a7..0000000000 --- a/samples/snippets/gen_ai_model_test.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def test_llm_model() -> None: - # Determine project id, in this case prefer the one set in the environment - # variable GOOGLE_CLOUD_PROJECT (if any) - import os - - PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") - REGION = "us" - CONN_NAME = "bigframes-default-connection" - - # [START bigquery_dataframes_gen_ai_model] - from bigframes.ml.llm import PaLM2TextGenerator - import bigframes.pandas as bpd - - # Create the LLM model - session = bpd.get_global_session() - connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" - model = PaLM2TextGenerator(session=session, connection_name=connection) - - df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") - - # Prepare the prompts and send them to the LLM model for prediction - df_prompt_prefix = "Generate Pandas sample code for DataFrame." 
- df_prompt = df_prompt_prefix + df_api["API"] - - # Predict using the model - df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024) - # [END bigquery_dataframes_gen_ai_model] - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py new file mode 100644 index 0000000000..e5236317e2 --- /dev/null +++ b/samples/snippets/multimodal_test.py @@ -0,0 +1,118 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: + # destination folder must be in a GCS bucket that the BQ connection service account (default or user provided) has write access to. + dst_bucket = gcs_dst_bucket + # [START bigquery_dataframes_multimodal_dataframe_create] + import bigframes + + # Flag to enable the feature + bigframes.options.experiments.blob = True + + import bigframes.pandas as bpd + + # Create blob columns from wildcard path. + df_image = bpd.from_glob_path( + "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*", name="image" + ) + # Other ways are: from string uri column + # df = bpd.DataFrame({"uri": ["gs:///", "gs:///"]}) + # df["blob_col"] = df["uri"].str.to_blob() + + # From an existing object table + # df = bpd.read_gbq_object_table("", name="blob_col") + + # Take only the 5 images to deal with. 
Preview the content of the Multimodal DataFrame
+    df_image = df_image.head(5)
+    df_image
+    # [END bigquery_dataframes_multimodal_dataframe_create]
+
+    # [START bigquery_dataframes_multimodal_dataframe_merge]
+    # Combine unstructured data with structured data
+    df_image["author"] = ["alice", "bob", "bob", "alice", "bob"]  # type: ignore
+    df_image["content_type"] = df_image["image"].blob.content_type()
+    df_image["size"] = df_image["image"].blob.size()
+    df_image["updated"] = df_image["image"].blob.updated()
+    df_image
+
+    # Filter images and display them; audio and video types can be displayed too
+    df_image[df_image["author"] == "alice"]["image"].blob.display()
+    # [END bigquery_dataframes_multimodal_dataframe_merge]
+
+    # [START bigquery_dataframes_multimodal_dataframe_image_transform]
+    df_image["blurred"] = df_image["image"].blob.image_blur(
+        (20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
+    )
+    df_image["resized"] = df_image["image"].blob.image_resize(
+        (300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
+    )
+    df_image["normalized"] = df_image["image"].blob.image_normalize(
+        alpha=50.0,
+        beta=150.0,
+        norm_type="minmax",
+        dst=f"{dst_bucket}/image_normalize_transformed/",
+    )
+
+    # You can also chain functions together
+    df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
+        (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
+    )
+    df_image
+    # [END bigquery_dataframes_multimodal_dataframe_image_transform]
+
+    # [START bigquery_dataframes_multimodal_dataframe_ai]
+    from bigframes.ml import llm
+
+    gemini = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-002")
+
+    # Work with the first 2 images as an example
+    df_image = df_image.head(2)
+
+    # Ask the same question about each image
+    answer = gemini.predict(df_image, prompt=["what item is it?", df_image["image"]])
+    answer[["ml_generate_text_llm_result", "image"]]
+
+    # Ask different questions
+    df_image["question"] = [  # type: ignore
+        "what item is it?",
+        "what color is the picture?",
+    ]
+    answer_alt = gemini.predict(
+        df_image, prompt=[df_image["question"], df_image["image"]]
+    )
+    answer_alt[["ml_generate_text_llm_result", "image"]]
+
+    # Generate embeddings on images
+    embed_model = llm.MultimodalEmbeddingGenerator()
+    embeddings = embed_model.predict(df_image["image"])
+    embeddings
+    # [END bigquery_dataframes_multimodal_dataframe_ai]
+
+    # [START bigquery_dataframes_multimodal_dataframe_pdf_chunk]
+    # PDF chunking
+    df_pdf = bpd.from_glob_path(
+        "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
+    )
+    df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
+    chunked = df_pdf["chunked"].explode()
+    chunked
+    # [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]
+    assert df_image is not None
+    assert answer is not None
+    assert answer_alt is not None
+    assert embeddings is not None
+    assert chunked is not None
diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py
index c35daf35fc..3a7031ef89 100644
--- a/samples/snippets/remote_function.py
+++ b/samples/snippets/remote_function.py
@@ -47,9 +47,8 @@ def run_remote_function_and_read_gbq_function(project_id: str) -> None:
     # of the penguins, which is a real number, into a category, which is a
     # string.
     @bpd.remote_function(
-        float,
-        str,
         reuse=False,
+        cloud_function_service_account="default",
     )
     def get_bucket(num: float) -> str:
         if not num:
@@ -91,10 +90,9 @@ def get_bucket(num: float) -> str:
     # as a remote function.
The custom function in this example has external # package dependency, which can be specified via `packages` parameter. @bpd.remote_function( - str, - str, reuse=False, packages=["cryptography"], + cloud_function_service_account="default", ) def get_hash(input: str) -> str: from cryptography.fernet import Fernet diff --git a/samples/snippets/text_generation_test.py b/samples/snippets/text_generation_test.py deleted file mode 100644 index c4df1dde3b..0000000000 --- a/samples/snippets/text_generation_test.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def test_llm_text_generation() -> None: - # Determine project id, in this case prefer the one set in the environment - # variable GOOGLE_CLOUD_PROJECT (if any) - import os - - PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") - LOCATION = "US" - - # [START bigquery_dataframes_generate_text_tutorial_create_remote_model] - import bigframes - from bigframes.ml.llm import PaLM2TextGenerator - - bigframes.options.bigquery.project = PROJECT_ID - bigframes.options.bigquery.location = LOCATION - - model = PaLM2TextGenerator() - # [END bigquery_dataframes_generate_text_tutorial_create_remote_model] - assert model is not None - - # [START bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] - import bigframes.pandas as bpd - - df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) - df_prompt_prefix = "Extract the key words from the text below: " - df_prompt = df_prompt_prefix + df["review"] - - # Predict using the model - df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) - df_pred.peek(5) - # [END bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] - # peek() is used to show a preview of the results. If the output - # of this sample changes, also update the screenshot for the associated - # tutorial on cloud.google.com. - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None - - # [START bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] - import bigframes.pandas as bpd - - df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) - df_prompt_prefix = "perform sentiment analysis on the following text, return one the following categories: positive, negative: " - df_prompt = df_prompt_prefix + df["review"] - - # Predict using the model - df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) - df_pred.peek(5) - # [END bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] - # peek() is used to show a preview of the results. If the output - # of this sample changes, also update the screenshot for the associated - # tutorial on cloud.google.com. 
- - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/scratch/.gitignore b/scratch/.gitignore new file mode 100644 index 0000000000..b813ccd98e --- /dev/null +++ b/scratch/.gitignore @@ -0,0 +1,2 @@ +# Ignore all files in this directory. +* diff --git a/setup.py b/setup.py index 34e013c9a3..edc77e11b6 100644 --- a/setup.py +++ b/setup.py @@ -41,20 +41,19 @@ "google-auth >=2.15.0,<3.0dev", "google-cloud-bigtable >=2.24.0", "google-cloud-pubsub >=2.21.4", - "google-cloud-bigquery[bqstorage,pandas] >=3.18.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - # Upper bound due to no windows build for 1.1.2 - "jellyfish >=0.8.9,<1.1.2", "numpy >=1.24.0", "pandas >=1.5.3", - "pandas-gbq >=0.26.0", + "pandas-gbq >=0.26.1", "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", + "shapely >=1.8.5", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -63,7 +62,6 @@ "db-dtypes >=1.4.2", # For vendored ibis-framework. "atpublic>=2.3,<6", - "parsy>=2,<3", "python-dateutil>=2.8.2,<3", "pytz>=2022.7", "toolz>=0.11,<2", @@ -79,10 +77,10 @@ # Packages required for basic development flow. "dev": [ "pytest", - "pytest-mock", "pre-commit", "nox", "google-cloud-testutils", + "freezegun", ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) @@ -117,6 +115,7 @@ version=version_id, description=description, long_description=readme, + long_description_content_type="text/x-rst", author="Google LLC", author_email="bigframes-feedback@google.com", license="Apache 2.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8c7c69efa7..dff245d176 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -6,20 +6,20 @@ geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigtable==2.24.0 google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.18.0 +google-cloud-bigquery==3.31.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 -jellyfish==0.8.9 numpy==1.24.0 pandas==1.5.3 -pandas-gbq==0.26.0 +pandas-gbq==0.26.1 pyarrow==15.0.2 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 +shapely==1.8.5 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 @@ -28,7 +28,6 @@ matplotlib==3.7.1 db-dtypes==1.4.2 # For vendored ibis-framework. 
atpublic==2.3 -parsy==2.0 python-dateutil==2.8.2 pytz==2022.7 toolz==0.11 diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl new file mode 100644 index 0000000000..b7cd350d08 --- /dev/null +++ b/tests/data/ratings.jsonl @@ -0,0 +1,20 @@ +{"user_id": 1, "item_id": 2, "rating": 4.0} +{"user_id": 1, "item_id": 5, "rating": 3.0} +{"user_id": 2, "item_id": 1, "rating": 5.0} +{"user_id": 2, "item_id": 3, "rating": 2.0} +{"user_id": 3, "item_id": 4, "rating": 4.5} +{"user_id": 3, "item_id": 7, "rating": 3.5} +{"user_id": 4, "item_id": 2, "rating": 1.0} +{"user_id": 4, "item_id": 8, "rating": 5.0} +{"user_id": 5, "item_id": 3, "rating": 4.0} +{"user_id": 5, "item_id": 9, "rating": 2.5} +{"user_id": 6, "item_id": 1, "rating": 3.0} +{"user_id": 6, "item_id": 6, "rating": 4.5} +{"user_id": 7, "item_id": 5, "rating": 5.0} +{"user_id": 7, "item_id": 10, "rating": 1.5} +{"user_id": 8, "item_id": 4, "rating": 2.0} +{"user_id": 8, "item_id": 7, "rating": 4.0} +{"user_id": 9, "item_id": 2, "rating": 3.5} +{"user_id": 9, "item_id": 9, "rating": 5.0} +{"user_id": 10, "item_id": 3, "rating": 4.5} +{"user_id": 10, "item_id": 8, "rating": 2.5} diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json new file mode 100644 index 0000000000..9fd0101ec8 --- /dev/null +++ b/tests/data/ratings_schema.json @@ -0,0 +1,17 @@ +[ + { + "mode": "NULLABLE", + "name": "user_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "item_id", + "type": "INT64" + }, + { + "mode": "NULLABLE", + "name": "rating", + "type": "FLOAT" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 398ee8a6b2..19f2a79b65 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -185,8 +185,27 @@ def session_tokyo(tokyo_location: str) -> Generator[bigframes.Session, None, Non @pytest.fixture(scope="session") -def bq_connection(bigquery_client: bigquery.Client) -> str: - return f"{bigquery_client.project}.{bigquery_client.location}.bigframes-rf-conn" +def test_session() -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions( + client_endpoints_override={ + "bqclient": "https://test-bigquery.sandbox.google.com", + "bqconnectionclient": "test-bigqueryconnection.sandbox.googleapis.com", + "bqstoragereadclient": "test-bigquerystorage-grpc.sandbox.googleapis.com", + }, + ) + session = bigframes.Session(context=context) + yield session + session.close() + + +@pytest.fixture(scope="session") +def bq_connection_name() -> str: + return "bigframes-rf-conn" + + +@pytest.fixture(scope="session") +def bq_connection(bigquery_client: bigquery.Client, bq_connection_name: str) -> str: + return f"{bigquery_client.project}.{bigquery_client.location}.{bq_connection_name}" @pytest.fixture(scope="session", autouse=True) @@ -315,6 +334,7 @@ def load_test_data_tables( ("repeated", "repeated_schema.json", "repeated.jsonl"), ("json", "json_schema.json", "json.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), + ("ratings", "ratings_schema.json", "ratings.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), ("matrix_2by3", "matrix_2by3.json", "matrix_2by3.jsonl"), @@ -411,6 +431,11 @@ def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] +@pytest.fixture(scope="session") +def ratings_table_id(test_data_tables) -> str: + return test_data_tables["ratings"] + + @pytest.fixture(scope="session") def urban_areas_table_id(test_data_tables) -> 
str: return test_data_tables["urban_areas"] @@ -460,7 +485,7 @@ def nested_structs_df( @pytest.fixture(scope="session") -def nested_structs_pandas_df() -> pd.DataFrame: +def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame: """pd.DataFrame pointing at test data.""" df = pd.read_json( @@ -468,6 +493,7 @@ def nested_structs_pandas_df() -> pd.DataFrame: lines=True, ) df = df.set_index("id") + df["person"] = df["person"].astype(nested_structs_pandas_type) return df @@ -763,6 +789,14 @@ def penguins_df_null_index( return unordered_session.read_gbq(penguins_table_id) +@pytest.fixture(scope="session") +def ratings_df_default_index( + ratings_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(ratings_table_id) + + @pytest.fixture(scope="session") def time_series_df_default_index( time_series_table_id: str, session: bigframes.Session @@ -890,8 +924,8 @@ def llm_text_pandas_df(): @pytest.fixture(scope="session") -def llm_text_df(session, llm_text_pandas_df): - return session.read_pandas(llm_text_pandas_df) +def llm_text_df(test_session, llm_text_pandas_df): + return test_session.read_pandas(llm_text_pandas_df) @pytest.fixture(scope="session") @@ -1464,13 +1498,14 @@ def images_uris() -> list[str]: @pytest.fixture(scope="session") def images_mm_df( - images_gcs_path, session: bigframes.Session, bq_connection: str + images_uris, test_session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: bigframes.options.experiments.blob = True - return session.from_glob_path( - images_gcs_path, name="blob_col", connection=bq_connection + blob_series = bpd.Series(images_uris, session=test_session).str.to_blob( + connection=bq_connection ) + return blob_series.rename("blob_col").to_frame() @pytest.fixture() @@ -1489,8 +1524,10 @@ def pdf_gcs_path() -> str: @pytest.fixture(scope="session") def pdf_mm_df( - pdf_gcs_path, session: bigframes.Session, bq_connection: str + pdf_gcs_path, test_session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: bigframes.options.experiments.blob = True - return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection) + return test_session.from_glob_path( + pdf_gcs_path, name="pdf", connection=bq_connection + ) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index d428299a96..a2c3f2b85f 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -25,12 +25,6 @@ from bigframes import dtypes import bigframes.pandas as bpd -# TODO(shobs): restore these tests after the managed udf cleanup issue is -# resolved in the test project -pytestmark = pytest.mark.skip( - reason="temporarily disable to debug managed udf cleanup in the test project" -) - @pytest.fixture(scope="function") def images_output_folder() -> Generator[str, None, None]: @@ -61,11 +55,11 @@ def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - session: bigframes.Session, + test_session: bigframes.Session, ): bigframes.options.experiments.blob = True - series = bpd.Series(images_output_uris, session=session).str.to_blob( + series = bpd.Series(images_output_uris, session=test_session).str.to_blob( connection=bq_connection ) @@ -135,11 +129,11 @@ def test_blob_image_resize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - session: bigframes.Session, + test_session: 
bigframes.Session, ): bigframes.options.experiments.blob = True - series = bpd.Series(images_output_uris, session=session).str.to_blob( + series = bpd.Series(images_output_uris, session=test_session).str.to_blob( connection=bq_connection ) @@ -211,11 +205,11 @@ def test_blob_image_normalize_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, images_output_uris: list[str], - session: bigframes.Session, + test_session: bigframes.Session, ): bigframes.options.experiments.blob = True - series = bpd.Series(images_output_uris, session=session).str.to_blob( + series = bpd.Series(images_output_uris, session=test_session).str.to_blob( connection=bq_connection ) diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index eabafd96fb..a15bce83ad 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -16,17 +16,14 @@ import pandas import pyarrow import pytest +import test_utils.prefixer import bigframes import bigframes.exceptions as bfe import bigframes.pandas as bpd from tests.system.utils import cleanup_function_assets -# TODO(shobs): restore these tests after the managed udf cleanup issue is -# resolved in the test project -pytestmark = pytest.mark.skip( - reason="temporarily disable to debug managed udf cleanup in the test project" -) +prefixer = test_utils.prefixer.Prefixer("bigframes", "") def test_managed_function_multiply_with_ibis( @@ -43,6 +40,7 @@ def test_managed_function_multiply_with_ibis( input_types=[int, int], output_type=int, dataset=dataset_id, + name=prefixer.create_prefix(), ) def multiply(x, y): return x * y @@ -93,6 +91,7 @@ def test_managed_function_stringify_with_ibis( input_types=[int], output_type=str, dataset=dataset_id, + name=prefixer.create_prefix(), ) def stringify(x): return f"I got {x}" @@ -129,7 +128,10 @@ def stringify(x): def test_managed_function_array_output(session, scalars_dfs, dataset_id): try: - @session.udf(dataset=dataset_id) + @session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + ) def featurize(x: int) -> list[float]: return [float(i) for i in [x, x + 1, x + 2]] @@ -166,13 +168,10 @@ def featurize(x: int) -> list[float]: cleanup_function_assets(featurize, session.bqclient, ignore_failures=False) -def test_managed_function_series_apply( - session, - scalars_dfs, -): +def test_managed_function_series_apply(session, dataset_id, scalars_dfs): try: - @session.udf() + @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) def foo(x: int) -> bytes: return bytes(abs(x)) @@ -223,13 +222,14 @@ def foo(x: int) -> bytes: def test_managed_function_series_apply_array_output( session, + dataset_id, scalars_dfs, ): try: with pytest.warns(bfe.PreviewWarning, match="udf is in preview."): - @session.udf() + @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) def foo_list(x: int) -> list[float]: return [float(abs(x)), float(abs(x) + 1)] @@ -252,7 +252,7 @@ def foo_list(x: int) -> list[float]: cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False) -def test_managed_function_series_combine(session, scalars_dfs): +def test_managed_function_series_combine(session, dataset_id, scalars_dfs): try: # This function is deliberately written to not work with NA input. def add(x: int, y: int) -> int: @@ -267,7 +267,9 @@ def add(x: int, y: int) -> int: # make sure there are NA values in the test column. 
assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) - add_managed_func = session.udf()(add) + add_managed_func = session.udf( + dataset=dataset_id, name=prefixer.create_prefix() + )(add) # with nulls in the series the managed function application would fail. with pytest.raises( @@ -310,7 +312,7 @@ def add(x: int, y: int) -> int: ) -def test_managed_function_series_combine_array_output(session, scalars_dfs): +def test_managed_function_series_combine_array_output(session, dataset_id, scalars_dfs): try: def add_list(x: int, y: int) -> list[int]: @@ -325,7 +327,9 @@ def add_list(x: int, y: int) -> list[int]: # Make sure there are NA values in the test column. assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) - add_list_managed_func = session.udf()(add_list) + add_list_managed_func = session.udf( + dataset=dataset_id, name=prefixer.create_prefix() + )(add_list) # After filtering out nulls the managed function application should work # similar to pandas. @@ -373,7 +377,7 @@ def add_list(x: int, y: int) -> list[int]: ) -def test_managed_function_dataframe_map(session, scalars_dfs): +def test_managed_function_dataframe_map(session, dataset_id, scalars_dfs): try: def add_one(x): @@ -382,6 +386,8 @@ def add_one(x): mf_add_one = session.udf( input_types=[int], output_type=int, + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -407,9 +413,7 @@ def add_one(x): cleanup_function_assets(mf_add_one, session.bqclient, ignore_failures=False) -def test_managed_function_dataframe_map_array_output( - session, scalars_dfs, dataset_id_permanent -): +def test_managed_function_dataframe_map_array_output(session, scalars_dfs, dataset_id): try: def add_one_list(x): @@ -418,6 +422,8 @@ def add_one_list(x): mf_add_one_list = session.udf( input_types=[int], output_type=list[int], + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_one_list) scalars_df, scalars_pandas_df = scalars_dfs @@ -448,7 +454,7 @@ def add_one_list(x): ) -def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs): +def test_managed_function_dataframe_apply_axis_1(session, dataset_id, scalars_dfs): try: scalars_df, scalars_pandas_df = scalars_dfs series = scalars_df["int64_too"] @@ -460,6 +466,8 @@ def add_ints(x, y): add_ints_mf = session.udf( input_types=[int, int], output_type=int, + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_ints) assert add_ints_mf.bigframes_bigquery_function # type: ignore @@ -484,7 +492,7 @@ def add_ints(x, y): cleanup_function_assets(add_ints_mf, session.bqclient, ignore_failures=False) -def test_managed_function_dataframe_apply_axis_1_array_output(session): +def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_id): bf_df = bigframes.dataframe.DataFrame( { "Id": [1, 2, 3], @@ -504,7 +512,12 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session): try: - @session.udf(input_types=[int, float, str], output_type=list[str]) + @session.udf( + input_types=[int, float, str], + output_type=list[str], + dataset=dataset_id, + name=prefixer.create_prefix(), + ) def foo(x, y, z): return [str(x), str(y), z] @@ -587,3 +600,45 @@ def foo(x, y, z): finally: # Clean up the gcp assets created for the managed function. 
cleanup_function_assets(foo, session.bqclient, ignore_failures=False) + + +@pytest.mark.parametrize( + "connection_fixture", + [ + "bq_connection_name", + "bq_connection", + ], +) +def test_managed_function_with_connection( + session, scalars_dfs, dataset_id, request, connection_fixture +): + try: + bigquery_connection = request.getfixturevalue(connection_fixture) + + @session.udf( + bigquery_connection=bigquery_connection, + dataset=dataset_id, + name=prefixer.create_prefix(), + ) + def foo(x: int) -> int: + return x + 10 + + # Function should still work normally. + assert foo(-2) == 8 + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_result_col = scalars_pandas_df["int64_too"].apply(foo) + pd_result = ( + scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(foo, session.bqclient, ignore_failures=False) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 1e5e7ede26..426813b0ff 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -17,11 +17,9 @@ import inspect import math # must keep this at top level to test udf referring global import import os.path -import re import shutil import tempfile import textwrap -import typing import warnings import google.api_core.exceptions @@ -111,11 +109,14 @@ def test_remote_function_multiply_with_ibis( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int, int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def multiply(x, y): return x * y @@ -165,11 +166,14 @@ def test_remote_function_stringify_with_ibis( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], str, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def stringify(x): return f"I got {x}" @@ -213,11 +217,14 @@ def func(x, y): return x * abs(y % 4) remote_func = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [str, int], str, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(func) scalars_df, scalars_pandas_df = scalars_dfs @@ -250,11 +257,14 @@ def func(x, y): return [len(x), abs(y % 4)] remote_func = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. 
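# (A hedged sketch of the two call styles this comment is guarding; `conn` is a
# placeholder connection id. Under the 2.x signature only input_types,
# output_type, and dataset may still be passed positionally; everything else,
# including bigquery_connection and the now-required
# cloud_function_service_account, is keyword-only:
#
#     session.remote_function(
#         [str, int], str, dataset_id,
#         bigquery_connection=conn,
#         reuse=False,
#         cloud_function_service_account="default",
#     )
#
# is equivalent to spelling all of the parameters out as keywords.)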
[str, int], list[int], dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(func) scalars_df, scalars_pandas_df = scalars_dfs @@ -284,11 +294,14 @@ def test_remote_function_decorator_with_bigframes_series( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def square(x): return x * x @@ -330,11 +343,14 @@ def add_one(x): return x + 1 remote_add_one = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -380,7 +396,14 @@ def test_remote_function_input_types(session, scalars_dfs, input_types): def add_one(x): return x + 1 - remote_add_one = session.remote_function(input_types, int, reuse=False)(add_one) + remote_add_one = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. + input_types, + int, + reuse=False, + cloud_function_service_account="default", + )(add_one) assert remote_add_one.input_dtypes == (bigframes.dtypes.INT_DTYPE,) scalars_df, scalars_pandas_df = scalars_dfs @@ -406,11 +429,14 @@ def test_remote_function_explicit_dataset_not_created( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, - dataset_id_not_created, - bq_cf_connection, + dataset=dataset_id_not_created, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def square(x): return x * x @@ -459,11 +485,14 @@ def sign(num): return NO_SIGN remote_sign = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(sign) scalars_df, scalars_pandas_df = scalars_dfs @@ -506,11 +535,14 @@ def circumference(radius): return 2 * mymath.pi * radius remote_circumference = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. 
[float], float, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(circumference) scalars_df, scalars_pandas_df = scalars_dfs @@ -555,11 +587,12 @@ def find_team(num): return _team_pi remote_find_team = session.remote_function( - [float], - str, - dataset_id, - bq_cf_connection, + input_types=[float], + output_type=str, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(find_team) scalars_df, scalars_pandas_df = scalars_dfs @@ -627,11 +660,12 @@ def add_one(x): # The first time both the cloud function and the bq remote function don't # exist and would be created remote_add_one = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=True, + cloud_function_service_account="default", )(add_one_uniq) # There should have been exactly one cloud function created at this point @@ -697,11 +731,12 @@ def inner_test(): # exist even though the remote function exists, and goes ahead and recreates # the cloud function remote_add_one = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=True, + cloud_function_service_account="default", )(add_one_uniq) # There should be exactly one cloud function again @@ -743,11 +778,12 @@ def is_odd(num): return flag is_odd_remote = session.remote_function( - [int], - bool, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=bool, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(is_odd) scalars_df, scalars_pandas_df = scalars_dfs @@ -783,11 +819,12 @@ def is_odd(num): return flag is_odd_remote = session.remote_function( - [int], - bool, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=bool, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(is_odd) scalars_df, scalars_pandas_df = scalars_dfs @@ -817,11 +854,12 @@ def test_remote_udf_lambda(session, scalars_dfs, dataset_id, bq_cf_connection): add_one_lambda = lambda x: x + 1 # noqa: E731 add_one_lambda_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one_lambda) scalars_df, scalars_pandas_df = scalars_dfs @@ -872,12 +910,13 @@ def square(x): # Create the remote function with the name provided explicitly square_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, name=rf_name, + cloud_function_service_account="default", )(square) # The remote function should reflect the explicitly provided name @@ -925,12 +964,13 @@ def pd_np_foo(x): # Create the remote function with the name provided explicitly pd_np_foo_remote = session.remote_function( - [int], - float, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=float, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, packages=["numpy", "pandas >= 2.0.0"], + cloud_function_service_account="default", )(pd_np_foo) #
The behavior of the created remote function should be as expected @@ -1005,11 +1045,12 @@ def test_internal(rf, udf): # Create a new remote function with the name provided explicitly square_remote1 = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(square_uniq) # The remote function should reflect the explicitly provided name @@ -1030,11 +1071,12 @@ def test_internal(rf, udf): # explicitly. Since reuse is True by default, the previously created # remote function with the same name will be reused. square_remote2 = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(square_uniq) # The new remote function should still reflect the explicitly provided name @@ -1074,11 +1116,12 @@ def plusone(x): # created remote function with the same name should not be reused since # this time it is a different user code. plusone_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(plusone_uniq) # The new remote function should still reflect the explicitly provided name @@ -1139,7 +1182,13 @@ def test_remote_function_via_session_context_connection_setter( # unique dataset_id, even though the cloud function would be reused, the bq # remote function would still be created, making use of the bq connection # set in the BigQueryOptions above. - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1174,7 +1223,13 @@ def square(x): def test_remote_function_default_connection(session, scalars_dfs, dataset_id): try: - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1209,7 +1264,13 @@ def square(x): def test_remote_function_runtime_error(session, scalars_dfs, dataset_id): try: - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1233,7 +1294,12 @@ def test_remote_function_anonymous_dataset(session, scalars_dfs): # function in the bigframes session's anonymous dataset. Use reuse=False # param to make sure parallel instances of the test don't step over each # other due to the common anonymous dataset. 
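# (A hedged aside: with no explicit dataset, the routine is created in the
# session's anonymous dataset and is dropped when the session is cleaned up,
# as the unnamed/named cleanup tests later in this file verify. A minimal 2.x
# invocation, including the service account the new API requires, might look
# like:
#
#     @session.remote_function(
#         reuse=False, cloud_function_service_account="default"
#     )
#     def square(x: int) -> int:
#         return x * x
# )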
- @session.remote_function([int], int, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1290,14 +1356,27 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): try: + # TODO(shobs): Figure out why the default ingress setting + # (internal-only) does not work here @rf_session.remote_function( - [int], int, reuse=False, cloud_function_service_account=gcf_service_account + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account=gcf_service_account, + cloud_function_ingress_settings="all", ) def square_num(x): if x is None: return x return x * x + # assert that the GCF is created with the intended SA + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.service_config.service_account_email == gcf_service_account + + # assert that the function works as expected on data scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] @@ -1309,12 +1388,6 @@ def square_num(x): pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - # Assert that the GCF is created with the intended SA - gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num.bigframes_cloud_function - ) - assert gcf.service_config.service_account_email == gcf_service_account finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1322,38 +1395,12 @@ def square_num(x): ) -@pytest.mark.parametrize( - ("remote_function_args"), - [ - pytest.param( - {}, - id="no-set", - ), - pytest.param( - {"cloud_function_service_account": None}, - id="set-none", - ), - ], -) -def test_remote_function_warns_default_cloud_function_service_account( - session, remote_function_args -): - with pytest.warns(FutureWarning) as record: - session.remote_function(**remote_function_args) - - len( - [ - warn - for warn in record - if re.search( - ( - "You have not explicitly set a user-managed.*Using the default Compute Engine.*service account" - ), - typing.cast(FutureWarning, warn.message).args[0], - re.DOTALL, - ) - ] - ) == 1 +def test_remote_function_throws_none_cloud_function_service_account(session): + with pytest.raises( + ValueError, + match='^You must provide a user managed cloud_function_service_account, or "default" if you would like to let the default service account be used.$', + ): + session.remote_function(cloud_function_service_account=None) @pytest.mark.flaky(retries=2, delay=120) @@ -1378,9 +1425,10 @@ def test_remote_function_with_gcf_cmek(): try: @session.remote_function( - [int], - int, + input_types=[int], + output_type=int, reuse=False, + cloud_function_service_account="default", cloud_function_kms_key_name=cmek, cloud_function_docker_repository=docker_repository, ) @@ -1452,10 +1500,24 @@ def square_num(x): return x return x * x + # TODO(shobs): See if the test vpc can be configured to make this flow + # work with the default ingress setting (internal-only) square_num_remote = rf_session.remote_function( - [int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_function_vpc_connector=gcf_vpc_connector, + cloud_function_ingress_settings="all", )(square_num) + # assert that the GCF is created with the intended vpc connector + gcf = 
rf_session.cloudfunctionsclient.get_function( + name=square_num_remote.bigframes_cloud_function + ) + assert gcf.service_config.vpc_connector == gcf_vpc_connector + + # assert that the function works as expected on data scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] @@ -1467,12 +1529,6 @@ def square_num(x): pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - # Assert that the GCF is created with the intended vpc connector - gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num_remote.bigframes_cloud_function - ) - assert gcf.service_config.vpc_connector == gcf_vpc_connector finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1495,7 +1551,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, max_batching_rows=max_batching_rows + input_types=[int], + output_type=int, + reuse=False, + max_batching_rows=max_batching_rows, + cloud_function_service_account="default", )(square) bq_routine = session.bqclient.get_routine( @@ -1534,7 +1594,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, **timeout_args + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + **timeout_args, )(square) # Assert that the GCF is created with the intended maximum timeout @@ -1560,7 +1624,13 @@ def square(x): def test_remote_function_gcf_timeout_max_supported_exceeded(session): with pytest.raises(ValueError): - @session.remote_function([int], int, reuse=False, cloud_function_timeout=1201) + @session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_function_timeout=1201, + ) def square(x): return x * x @@ -1583,7 +1653,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, **max_instances_args + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + **max_instances_args, )(square) # Assert that the GCF is created with the intended max instance count @@ -1632,7 +1706,10 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(serialize_row) assert getattr(serialize_row_remote, "is_row_processor") @@ -1678,7 +1755,10 @@ def analyze(row): ) analyze_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(analyze) assert getattr(analyze_remote, "is_row_processor") @@ -1799,7 +1879,10 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(serialize_row) assert getattr(serialize_row_remote, "is_row_processor") @@ -1856,7 +1939,10 @@ def float_parser(row): return float(row["text"]) float_parser_remote = session.remote_function( - bigframes.series.Series, float, reuse=False + input_types=bigframes.series.Series, + output_type=float, + reuse=False, + cloud_function_service_account="default", )(float_parser) assert getattr(float_parser_remote, 
"is_row_processor") @@ -1901,7 +1987,9 @@ def test_remote_function_gcf_memory( def square(x: int) -> int: return x * x - square_remote = session.remote_function(reuse=False, **memory_mib_args)(square) + square_remote = session.remote_function( + reuse=False, cloud_function_service_account="default", **memory_mib_args + )(square) # Assert that the GCF is created with the intended memory gcf = session.cloudfunctionsclient.get_function( @@ -1936,7 +2024,11 @@ def test_remote_function_gcf_memory_unsupported(session, memory_mib): match="Invalid value specified for container memory", ): - @session.remote_function(reuse=False, cloud_function_memory_mib=memory_mib) + @session.remote_function( + reuse=False, + cloud_function_service_account="default", + cloud_function_memory_mib=memory_mib, + ) def square(x: int) -> int: return x * x @@ -1947,7 +2039,7 @@ def test_remote_function_unnamed_removed_w_session_cleanup(): session = bigframes.connect() # create an unnamed remote function in the session - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def foo(x: int) -> int: return x + 1 @@ -1989,7 +2081,9 @@ def test_remote_function_named_perists_w_session_cleanup(): name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() # create an unnamed remote function in the session - @session.remote_function(reuse=False, name=name) + @session.remote_function( + reuse=False, name=name, cloud_function_service_account="default" + ) def foo(x: int) -> int: return x + 1 @@ -2030,14 +2124,16 @@ def test_remote_function_clean_up_by_session_id(): # without it, and later confirm that the former is deleted when the session # is cleaned up by session id, but the latter remains ## unnamed - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def foo_unnamed(x: int) -> int: return x + 1 ## named rf_name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() - @session.remote_function(reuse=False, name=rf_name) + @session.remote_function( + reuse=False, name=rf_name, cloud_function_service_account="default" + ) def foo_named(x: int) -> int: return x + 2 @@ -2104,7 +2200,12 @@ def test_df_apply_axis_1_multiple_params(session): try: - @session.remote_function([int, float, str], str, reuse=False) + @session.remote_function( + input_types=[int, float, str], + output_type=str, + reuse=False, + cloud_function_service_account="default", + ) def foo(x, y, z): return f"I got {x}, {y} and {z}" @@ -2179,7 +2280,12 @@ def test_df_apply_axis_1_multiple_params_array_output(session): try: - @session.remote_function([int, float, str], list[str], reuse=False) + @session.remote_function( + input_types=[int, float, str], + output_type=list[str], + reuse=False, + cloud_function_service_account="default", + ) def foo(x, y, z): return [str(x), str(y), z] @@ -2259,7 +2365,12 @@ def test_df_apply_axis_1_single_param_non_series(session): try: - @session.remote_function([int], str, reuse=False) + @session.remote_function( + input_types=[int], + output_type=str, + reuse=False, + cloud_function_service_account="default", + ) def foo(x): return f"I got {x}" @@ -2313,7 +2424,7 @@ def test_df_apply_axis_1_array_output(session, scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs try: - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def generate_stats(row: pandas.Series) -> list[int]: import pandas as pd @@ -2356,13 
+2467,13 @@ generate_stats(row: pandas.Series) -> list[int]: [ pytest.param( {}, - functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, - True, + functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, + False, id="no-set", ), pytest.param( {"cloud_function_ingress_settings": None}, - functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, + functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, True, id="set-none", ), @@ -2402,17 +2513,16 @@ def square(x: int) -> int: return x * x square_remote = session.remote_function( - reuse=False, **ingress_settings_args + reuse=False, + cloud_function_service_account="default", + **ingress_settings_args, )(square) default_ingress_setting_warnings = [ warn for warn in record - if isinstance(warn.message, FutureWarning) - and "`cloud_function_ingress_settings` are set to 'all' by default" - in warn.message.args[0] - and "will change to 'internal-only' for enhanced security in future" - in warn.message.args[0] + if isinstance(warn.message, UserWarning) + and "The `cloud_function_ingress_settings` is being set to 'internal-only' by default." + in warn.message.args[0] ] assert len(default_ingress_setting_warnings) == ( 1 if expect_default_ingress_setting_warning else 0 @@ -2443,7 +2553,11 @@ def test_remote_function_ingress_settings_unsupported(session): ValueError, match="'unknown' not one of the supported ingress settings values" ): - @session.remote_function(reuse=False, cloud_function_ingress_settings="unknown") + @session.remote_function( + reuse=False, + cloud_function_service_account="default", + cloud_function_ingress_settings="unknown", + ) def square(x: int) -> int: return x * x @@ -2475,6 +2589,7 @@ def add_one(x: int) -> int: dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one) temporary_bigquery_remote_function = ( @@ -2552,6 +2667,7 @@ def add_one(x: int) -> int: bigquery_connection=bq_cf_connection, reuse=False, name=name, + cloud_function_service_account="default", )(add_one) persistent_bigquery_remote_function = ( @@ -2619,6 +2735,7 @@ def test_remote_function_array_output( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: int) -> list[array_dtype]: # type: ignore return [array_dtype(i) for i in [x, x + 1, x + 2]] @@ -2657,6 +2774,7 @@ def test_remote_function_array_output_partial_ordering_mode( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: float) -> list[float]: # type: ignore return [x, x + 1, x + 2] @@ -2698,6 +2816,7 @@ def test_remote_function_array_output_multiindex( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: int) -> list[float]: return [x, x + 0.5, x + 0.33] @@ -2720,3 +2839,33 @@ def featurize(x: int) -> list[float]: cleanup_function_assets( featurize, session.bqclient, session.cloudfunctionsclient ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_connection_path_format( + session, scalars_dfs, dataset_id, bq_cf_connection +): + try: + + @session.remote_function( + dataset=dataset_id, + bigquery_connection=f"projects/{session.bqclient.project}/locations/{session._location}/connections/{bq_cf_connection}", + reuse=False, + cloud_function_service_account="default", + ) + def foo(x: int) -> int: + return x + 1 + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col =
scalars_df["int64_too"] + bf_result = bf_int64_col.apply(foo).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_too"] + pd_result = pd_int64_col.apply(foo) + + # ignore any dtype disparity + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 49aa985189..d1a5f9f2aa 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -163,3 +163,49 @@ def test_decomposition_configure_fit_load_none_component( in reloaded_model._bqml_model.model_name ) assert reloaded_model.n_components == 7 + + +def test_decomposition_mf_configure_fit_load( + session, ratings_df_default_index, dataset_id +): + model = decomposition.MatrixFactorization( + num_factors=6, + feedback_type="explicit", + user_col="user_id", + item_col="item_id", + rating_col="rating", + l2_reg=9.83, + ) + + model.fit(ratings_df_default_index) + + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_mf_model", replace=True + ) + + new_ratings = session.read_pandas( + pd.DataFrame( + { + "user_id": ["11", "12", "13"], + "item_id": [1, 2, 3], + "rating": [1.0, 2.0, 3.0], + } + ) + ) + + reloaded_model.score(new_ratings) + + result = reloaded_model.predict(new_ratings).to_pandas() + + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_mf_model" + in reloaded_model._bqml_model.model_name + ) + assert result is not None + assert reloaded_model.feedback_type == "explicit" + assert reloaded_model.num_factors == 6 + assert reloaded_model.user_col == "user_id" + assert reloaded_model.item_col == "item_id" + assert reloaded_model.rating_col == "rating" + assert reloaded_model.l2_reg == 9.83 diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 96215c5e47..be98902007 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -222,8 +222,8 @@ def test_unordered_mode_linear_regression_configure_fit_score_predict( start_execution_count = end_execution_count result = model.score(X_train, y_train).to_pandas() end_execution_count = df._block._expr.session._metrics.execution_count - # The score function and to_pandas each initiate one query. - assert end_execution_count - start_execution_count == 2 + # The score function and to_pandas reuse same result. + assert end_execution_count - start_execution_count == 1 utils.check_pandas_df_schema_and_index( result, columns=utils.ML_REGRESSION_METRICS, index=1 diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py index 76a7001fe3..ee9daa4e31 100644 --- a/tests/system/large/test_dataframe_io.py +++ b/tests/system/large/test_dataframe_io.py @@ -44,7 +44,7 @@ def test_to_pandas_batches_override_global_option( page_size=500, max_results=1500, allow_large_results=True ) ) - assert len(w) == 2 + assert len(w) == 1 assert issubclass(w[0].category, FutureWarning) assert "The query result size has exceeded 10 GB." in str(w[0].message) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index 7801f5dada..d4428c1f95 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -13,9 +13,11 @@ # limitations under the License. 
import typing -import warnings from google.cloud import bigquery +from google.cloud.bigquery_storage import types as bqstorage_types +import pandas +import pandas.testing import pytest import bigframes @@ -41,6 +43,7 @@ def _assert_bq_execution_location( assert typing.cast(bigquery.QueryJob, df.query_job).location == expected_location + # Ensure operation involving BQ client succeeds result = ( df[["name", "number"]] .groupby("name") .sum(numeric_only=True) @@ -53,6 +56,35 @@ def _assert_bq_execution_location( typing.cast(bigquery.QueryJob, result.query_job).location == expected_location ) + expected_result = pandas.DataFrame( + {"number": [444, 222]}, index=pandas.Index(["aaa", "bbb"], name="name") + ) + pandas.testing.assert_frame_equal( + expected_result, result.to_pandas(), check_dtype=False, check_index_type=False + ) + + # Ensure BQ Storage Read client operation succeeds + table = result.query_job.destination + requested_session = bqstorage_types.ReadSession( # type: ignore[attr-defined] + table=f"projects/{table.project}/datasets/{table.dataset_id}/tables/{table.table_id}", + data_format=bqstorage_types.DataFormat.ARROW, # type: ignore[attr-defined] + ) + read_session = session.bqstoragereadclient.create_read_session( + parent=f"projects/{table.project}", + read_session=requested_session, + max_stream_count=1, + ) + reader = session.bqstoragereadclient.read_rows(read_session.streams[0].name) + frames = [] + for message in reader.rows().pages: + frames.append(message.to_dataframe()) + read_dataframe = pandas.concat(frames) + # normalize before comparing since we lost some of the bigframes column + # naming abstractions in the direct read of the destination table + read_dataframe = read_dataframe.set_index("name") + read_dataframe.columns = result.columns + pandas.testing.assert_frame_equal(expected_result, read_dataframe) + def test_bq_location_default(): session = bigframes.Session() @@ -119,22 +151,14 @@ def test_bq_location_non_canonical(set_location, resolved_location): sorted(bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_rep_endpoints(bigquery_location): - with warnings.catch_warnings(record=True) as record: - warnings.simplefilter("always") - session = bigframes.Session( - context=bigframes.BigQueryOptions( - location=bigquery_location, use_regional_endpoints=True - ) - ) - assert ( - len([warn for warn in record if isinstance(warn.message, FutureWarning)]) - == 0 + session = bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True ) + ) - # Verify that location and endpoints are correctly set for the BigQuery API + # Verify that location and endpoint are correctly set for the BigQuery API # client - # TODO(shobs): Figure out if the same can be verified for the other API - # clients. assert session.bqclient.location == bigquery_location assert ( session.bqclient._connection.API_BASE_URL == "https://bigquery.{location}.rep.googleapis.com".format( location=bigquery_location ) ) + # Verify that endpoint is correctly set for the BigQuery Storage API client + # TODO(shobs): Figure out if we can verify that location is set in the + # BigQuery Storage API client.
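# (For reference, a hedged sketch of opting in to regional endpoints; the
# location is a placeholder and must be one of
# bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS, since Session
# construction for other locations now raises ValueError, as tested below:
#
#     session = bigframes.Session(
#         context=bigframes.BigQueryOptions(
#             location="some-rep-location", use_regional_endpoints=True
#         )
#     )
# )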
+ assert ( + session.bqstoragereadclient.api_endpoint + == f"bigquerystorage.{bigquery_location}.rep.googleapis.com" + ) + # assert that bigframes session honors the location _assert_bq_execution_location(session) +def test_clients_provider_no_location(): + with pytest.raises(ValueError, match="Must set location to use regional endpoints"): + bigframes.session.clients.ClientsProvider(use_regional_endpoints=True) + + @pytest.mark.parametrize( "bigquery_location", # Sort the set to avoid nondeterminism. - sorted(bigframes.constants.LEP_ENABLED_BIGQUERY_LOCATIONS), + sorted(bigframes.constants.REP_NOT_ENABLED_BIGQUERY_LOCATIONS), ) -def test_bq_lep_endpoints(bigquery_location): - # We are not testing BigFrames Session for LEP endpoints because it involves - # query execution using the endpoint, which requires the project to be - # allowlisted for LEP access. We could hardcode one project which is - # allowlisted but then not every open source developer will have access to - # that. Let's rely on just creating the clients for LEP. - with pytest.warns(FutureWarning) as record: - clients_provider = bigframes.session.clients.ClientsProvider( +def test_clients_provider_use_regional_endpoints_non_rep_locations(bigquery_location): + with pytest.raises( + ValueError, + match=f"not .*available in the location {bigquery_location}", + ): + bigframes.session.clients.ClientsProvider( location=bigquery_location, use_regional_endpoints=True ) - assert len(record) == 1 - assert bigquery_location in typing.cast(Warning, record[0].message).args[0] - # Verify that location and endpoints are correctly set for the BigQuery API - # client - # TODO(shobs): Figure out if the same can be verified for the other API - # clients. - assert clients_provider.bqclient.location == bigquery_location - assert ( - clients_provider.bqclient._connection.API_BASE_URL - == "https://{location}-bigquery.googleapis.com".format( - location=bigquery_location + +@pytest.mark.parametrize( + "bigquery_location", + # Sort the set to avoid nondeterminism. 
+ sorted(bigframes.constants.REP_NOT_ENABLED_BIGQUERY_LOCATIONS), +) +def test_session_init_fails_to_use_regional_endpoints_non_rep_endpoints( + bigquery_location, +): + with pytest.raises( + ValueError, + match=f"not .*available in the location {bigquery_location}", + ): + bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True + ) ) - ) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index e117cf0327..1dac8c851e 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -72,12 +72,12 @@ def test_close(session: bigframes.Session): ) full_id_1 = bigframes.session._io.bigquery.create_temp_table( session.bqclient, - session._temp_storage_manager.allocate_temp_table(), + session._anon_dataset_manager.allocate_temp_table(), expiration, ) full_id_2 = bigframes.session._io.bigquery.create_temp_table( session.bqclient, - session._temp_storage_manager.allocate_temp_table(), + session._anon_dataset_manager.allocate_temp_table(), expiration, ) @@ -112,12 +112,12 @@ def test_clean_up_by_session_id(): ) bigframes.session._io.bigquery.create_temp_table( session.bqclient, - session._temp_storage_manager.allocate_temp_table(), + session._anon_dataset_manager.allocate_temp_table(), expiration, ) bigframes.session._io.bigquery.create_temp_table( session.bqclient, - session._temp_storage_manager.allocate_temp_table(), + session._anon_dataset_manager.allocate_temp_table(), expiration, ) @@ -157,10 +157,11 @@ def test_clean_up_via_context_manager(session_creator): with session_creator() as session: bqclient = session.bqclient - full_id_1 = session._temp_storage_manager.allocate_and_create_temp_table( + full_id_1 = session._anon_dataset_manager.create_temp_table( [bigquery.SchemaField("a", "INT64")], cluster_cols=[] ) - full_id_2 = session._temp_storage_manager.allocate_and_create_temp_table( + assert session._session_resource_manager is not None + full_id_2 = session._session_resource_manager.create_temp_table( [bigquery.SchemaField("b", "STRING")], cluster_cols=["b"] ) diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index 472be3d2ad..ee49c2703e 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -75,17 +75,19 @@ def test_index_repr_large_table(): def test_to_pandas_batches_large_table(): - df = bpd.read_gbq("load_testing.scalars_1tb") + df = bpd.read_gbq("load_testing.scalars_100gb") _, expected_column_count = df.shape # download only a few batches, since 100gb would be too much - iterable = df.to_pandas_batches(page_size=500, max_results=1500) + iterable = df.to_pandas_batches( + page_size=500, max_results=1500, allow_large_results=True + ) # use page size since client library doesn't support # streaming only part of the dataframe via bqstorage for pdf in iterable: batch_row_count, batch_column_count = pdf.shape assert batch_column_count == expected_column_count - assert batch_row_count > 0 + assert 0 < batch_row_count <= 500 @pytest.mark.skip(reason="See if it caused kokoro build aborted.") diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 57fc878643..00f690ed54 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -36,11 +36,7 @@ def test_json_set_at_json_path(json_path, expected_json): actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) expected = bpd.Series(expected_json,
dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @pytest.mark.parametrize( @@ -60,11 +56,7 @@ def test_json_set_at_json_value_type(json_value, expected_json): actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_set_w_more_pairs(): @@ -77,11 +69,7 @@ def test_json_set_w_more_pairs(): expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_set_w_invalid_value_type(): @@ -114,11 +102,7 @@ def test_json_extract_from_json(): actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_from_string(): @@ -129,11 +113,7 @@ def test_json_extract_from_string(): actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_w_invalid_series_type(): @@ -165,11 +145,7 @@ def test_json_extract_array_from_json(): expected.index.name = None expected.name = None - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_from_json_strings(): @@ -183,11 +159,7 @@ def test_json_extract_array_from_json_strings(): dtype=pd.ArrowDtype(pa.list_(pa.string())), ) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_from_json_array_strings(): @@ -201,11 +173,7 @@ def test_json_extract_array_from_json_array_strings(): dtype=pd.ArrowDtype(pa.list_(pa.string())), ) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - 
actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_w_invalid_series_type(): @@ -219,11 +187,7 @@ def test_json_extract_string_array_from_json_strings(): actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_from_array_strings(): @@ -231,11 +195,7 @@ def test_json_extract_string_array_from_array_strings(): actual = bbq.json_extract_string_array(s) expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_as_float_array_from_array_strings(): @@ -243,11 +203,7 @@ def test_json_extract_string_array_as_float_array_from_array_strings(): actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_w_invalid_series_type(): diff --git a/tests/system/small/bigquery/test_struct.py b/tests/system/small/bigquery/test_struct.py index 4970964edd..58c822f642 100644 --- a/tests/system/small/bigquery/test_struct.py +++ b/tests/system/small/bigquery/test_struct.py @@ -53,10 +53,9 @@ def test_struct_from_dataframe(columns_arg): srs = series.Series( columns_arg, ) - # Use allow_large_results=True, due to b/403028465 pd.testing.assert_series_equal( - srs.to_pandas(allow_large_results=True), - bbq.struct(srs.struct.explode()).to_pandas(allow_large_results=True), + srs.to_pandas(), + bbq.struct(srs.struct.explode()).to_pandas(), check_index_type=False, check_dtype=False, ) diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index c30f7674af..c496e5d631 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -19,11 +19,11 @@ def test_blob_create_from_uri_str( - bq_connection: str, session: bigframes.Session, images_uris + bq_connection: str, test_session: bigframes.Session, images_uris ): bigframes.options.experiments.blob = True - uri_series = bpd.Series(images_uris, session=session) + uri_series = bpd.Series(images_uris, session=test_session) blob_series = uri_series.str.to_blob(connection=bq_connection) pd_blob_df = blob_series.struct.explode().to_pandas() @@ -42,14 +42,21 @@ def test_blob_create_from_uri_str( def test_blob_create_from_glob_path( - bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris ): bigframes.options.experiments.blob = True - blob_df = session.from_glob_path( + blob_df = test_session.from_glob_path( images_gcs_path, 
connection=bq_connection, name="blob_col" ) - pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() + pd_blob_df = ( + blob_df["blob_col"] + .struct.explode() + .to_pandas() + .sort_values("uri") + .reset_index(drop=True) + ) + expected_df = pd.DataFrame( { "uri": images_uris, @@ -65,14 +72,20 @@ def test_blob_create_from_glob_path( def test_blob_create_read_gbq_object_table( - bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris + bq_connection: str, test_session: bigframes.Session, images_gcs_path, images_uris ): bigframes.options.experiments.blob = True - obj_table = session._create_object_table(images_gcs_path, bq_connection) + obj_table = test_session._create_object_table(images_gcs_path, bq_connection) - blob_df = session.read_gbq_object_table(obj_table, name="blob_col") - pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() + blob_df = test_session.read_gbq_object_table(obj_table, name="blob_col") + pd_blob_df = ( + blob_df["blob_col"] + .struct.explode() + .to_pandas() + .sort_values("uri") + .reset_index(drop=True) + ) expected_df = pd.DataFrame( { "uri": images_uris, diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index c7704ec86d..767dbe37b7 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import db_dtypes # type: ignore import pandas as pd import bigframes +import bigframes.dtypes as dtypes import bigframes.pandas as bpd @@ -55,33 +55,28 @@ def test_blob_version(images_mm_df: bpd.DataFrame): def test_blob_metadata(images_mm_df: bpd.DataFrame): - # allow_large_result=False incompatible with json b/401630655 - with bigframes.option_context( - "bigquery.allow_large_results", True, "experiments.blob", True - ): + with bigframes.option_context("experiments.blob", True): actual = images_mm_df["blob_col"].blob.metadata().to_pandas() expected = pd.Series( [ - { - "content_type": "image/jpeg", - "md5_hash": "e130ad042261a1883cd2cc06831cf748", - "size": 338390, - "updated": 1739574332000000, - }, - { - "content_type": "image/jpeg", - "md5_hash": "e2ae3191ff2b809fd0935f01a537c650", - "size": 43333, - "updated": 1739574332000000, - }, + ( + '{"content_type":"image/jpeg",' + '"md5_hash":"e130ad042261a1883cd2cc06831cf748",' + '"size":338390,' + '"updated":1739574332000000}' + ), + ( + '{"content_type":"image/jpeg",' + '"md5_hash":"e2ae3191ff2b809fd0935f01a537c650",' + '"size":43333,' + '"updated":1739574332000000}' + ), ], name="metadata", - dtype=db_dtypes.JSONDtype(), - ) - - pd.testing.assert_series_equal( - actual, expected, check_dtype=False, check_index_type=False + dtype=dtypes.JSON_DTYPE, ) + expected.index = expected.index.astype(dtypes.INT_DTYPE) + pd.testing.assert_series_equal(actual, expected) def test_blob_content_type(images_mm_df: bpd.DataFrame): diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0af7f4e42e..51e0459014 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -25,6 +25,7 @@ import test_utils.prefixer import bigframes +import bigframes.clients import bigframes.dtypes import bigframes.exceptions from bigframes.functions import _utils as bff_utils @@ -93,6 +94,11 @@ def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: return session 
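Stepping back to the blob changes above, both construction paths for a multimodal column reduce to a short sketch. The bucket, URIs, and connection id are placeholders, and blob support remains experimental:

    import bigframes
    import bigframes.pandas as bpd

    bigframes.options.experiments.blob = True  # blob columns are experimental

    conn = "my-project.us.my-connection"  # placeholder BQ connection
    uris = ["gs://my-bucket/images/a.jpg", "gs://my-bucket/images/b.jpg"]

    # Path 1: turn a string Series of GCS URIs into a blob column.
    blob_df = bpd.Series(uris).str.to_blob(connection=conn).rename("blob_col").to_frame()

    # Path 2: build the same frame directly from a glob pattern.
    session = bigframes.connect()
    blob_df2 = session.from_glob_path(
        "gs://my-bucket/images/*", name="blob_col", connection=conn
    )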
+def get_bq_connection_id_path_format(connection_id_dot_format): + fields = connection_id_dot_format.split(".") + return f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -107,8 +113,8 @@ def square(x): return x * x square = bff.remote_function( - int, - int, + input_types=int, + output_type=int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, @@ -118,6 +124,7 @@ def square(x): # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -154,11 +161,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_specified( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location( + session, scalars_dfs, dataset_id_permanent, bq_cf_connection_location, @@ -167,17 +171,15 @@ def square(x): return x * x square = bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, + input_types=int, + output_type=int, + session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -208,11 +210,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_mismatched( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location_mismatched( + session, dataset_id_permanent, bq_cf_connection_location_mismatched, ): @@ -221,31 +220,41 @@ def square(x): # connection doesn't match the location of the dataset. return x * x # pragma: NO COVER - with pytest.raises( - ValueError, - match=re.escape("The location does not match BigQuery connection location:"), - ): - bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, - dataset=dataset_id_permanent, - bigquery_connection=bq_cf_connection_location_mismatched, - # See e2e tests for tests that actually deploy the Cloud Function. 
- reuse=True, - name=get_function_name(square), - )(square) + bq_cf_connection_location_mismatched_path_fmt = get_bq_connection_id_path_format( + bigframes.clients.get_canonical_bq_connection_id( + bq_cf_connection_location_mismatched, + session.bqclient.project, + session._location, + ) + ) + connection_ids = [ + bq_cf_connection_location_mismatched, + bq_cf_connection_location_mismatched_path_fmt, + ] + + for connection_id in connection_ids: + with pytest.raises( + ValueError, + match=re.escape( + "The location does not match BigQuery connection location:" + ), + ): + bff.remote_function( + input_types=int, + output_type=int, + session=session, + dataset=dataset_id_permanent, + bigquery_connection=connection_id, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + name=get_function_name(square), + cloud_function_service_account="default", + )(square) @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_project_specified( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location_project( + session, scalars_dfs, dataset_id_permanent, bq_cf_connection_location_project, @@ -254,17 +263,15 @@ def square(x): return x * x square = bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, + input_types=int, + output_type=int, + session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -295,11 +302,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_project_mismatched( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_project_mismatched( + session, dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): @@ -308,25 +312,38 @@ def square(x): # connection doesn't match the project of the dataset. return x * x # pragma: NO COVER - with pytest.raises( - ValueError, - match=re.escape( - "The project_id does not match BigQuery connection gcp_project_id:" - ), - ): - bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, - dataset=dataset_id_permanent, - bigquery_connection=bq_cf_connection_location_project_mismatched, - # See e2e tests for tests that actually deploy the Cloud Function. 
- reuse=True, - name=get_function_name(square), - )(square) + bq_cf_connection_location_project_mismatched_path_fmt = ( + get_bq_connection_id_path_format( + bigframes.clients.get_canonical_bq_connection_id( + bq_cf_connection_location_project_mismatched, + session.bqclient.project, + session._location, + ) + ) + ) + connection_ids = [ + bq_cf_connection_location_project_mismatched, + bq_cf_connection_location_project_mismatched_path_fmt, + ] + + for connection_id in connection_ids: + with pytest.raises( + ValueError, + match=re.escape( + "The project_id does not match BigQuery connection gcp_project_id:" + ), + ): + bff.remote_function( + input_types=int, + output_type=int, + session=session, + dataset=dataset_id_permanent, + bigquery_connection=connection_id, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + name=get_function_name(square), + cloud_function_service_account="default", + )(square) @pytest.mark.flaky(retries=2, delay=120) @@ -337,11 +354,12 @@ def square(x): return x * x square = bff.remote_function( - int, - int, + input_types=int, + output_type=int, session=session_with_bq_connection, dataset=dataset_id_permanent, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -386,7 +404,11 @@ def square(x): # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. square = session_with_bq_connection.remote_function( - int, int, dataset_id_permanent, name=get_function_name(square) + input_types=int, + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -424,13 +446,14 @@ def square(x): return x * x square = session.remote_function( - int, - int, - dataset_id_permanent, - bq_cf_connection, + input_types=int, + output_type=int, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. 
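The hunks above all perform the same migration: input_types, output_type, dataset, and bigquery_connection become keyword-only arguments, and cloud_function_service_account must now be passed explicitly. A minimal sketch of the new call shape (the dataset ID and function name are placeholders, not taken from this diff):

    def square(x):
        return x * x

    square = session.remote_function(
        input_types=int,
        output_type=int,
        dataset="my_dataset",  # placeholder dataset ID
        name="square",  # placeholder; the tests derive this via get_function_name
        cloud_function_service_account="default",
    )(square)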
@@ -468,7 +491,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -499,7 +526,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -530,7 +561,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -576,6 +611,7 @@ def bytes_to_hex(mybytes: bytes) -> bytes: dataset=dataset_id_permanent, name=get_function_name(bytes_to_hex, package_requirements=packages), packages=packages, + cloud_function_service_account="default", )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() @@ -618,10 +654,11 @@ def add_one(x): return x + 1 # pragma: NO COVER session.remote_function( - [int], - int, + input_types=[int], + output_type=int, dataset=dataset_id_permanent, name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) @@ -651,8 +688,8 @@ def square1(x): return x * x square1 = bff.remote_function( - [int], - int, + input_types=[int], + output_type=int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, dataset=dataset_id_permanent, @@ -661,6 +698,7 @@ def square1(x): bigquery_connection=bq_cf_connection, reuse=True, name=get_function_name(square1), + cloud_function_service_account="default", )(square1) # Function should still work normally. 
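The row-processor tests in the hunks that follow use the same keyword-only shape, but take a whole row as a bigframes Series and are applied with apply(axis=1); sketched under the same placeholder assumptions:

    import bigframes.series

    def add_ints(row):
        return row["int64_col"] + row["int64_too"]

    add_ints_remote = session.remote_function(
        input_types=bigframes.series.Series,
        output_type=int,
        dataset="my_dataset",  # placeholder dataset ID
        name="add_ints",  # placeholder
        cloud_function_service_account="default",
    )(add_ints)

    # Applied row-wise rather than element-wise:
    # bf_df[["int64_col", "int64_too"]].apply(add_ints_remote, axis=1)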
@@ -1135,10 +1173,11 @@ def add_ints(row): match="input_types=Series is in preview.", ): add_ints_remote = session.remote_function( - bigframes.series.Series, - int, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=int, + dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), + cloud_function_service_account="default", )(add_ints) assert add_ints_remote.bigframes_remote_function # type: ignore assert add_ints_remote.bigframes_bigquery_function # type: ignore @@ -1187,10 +1226,11 @@ def add_ints(row): return row["int64_col"] + row["int64_too"] add_ints_remote = session.remote_function( - bigframes.series.Series, - int, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=int, + dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), + cloud_function_service_account="default", )(add_ints) bf_result = ( @@ -1226,10 +1266,11 @@ def add_numbers(row): return row["x"] + row["y"] add_numbers_remote = session.remote_function( - bigframes.series.Series, - float, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=float, + dataset=dataset_id_permanent, name=get_function_name(add_numbers, is_row_processor=True), + cloud_function_service_account="default", )(add_numbers) bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() @@ -1279,10 +1320,11 @@ def echo_len(row): return len(row) echo_len_remote = session.remote_function( - bigframes.series.Series, - float, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=float, + dataset=dataset_id_permanent, name=get_function_name(echo_len, is_row_processor=True), + cloud_function_service_account="default", )(echo_len) for column in columns_with_not_supported_dtypes: @@ -1315,7 +1357,9 @@ def should_mask(name: str) -> bool: assert "name" in inspect.signature(should_mask).parameters should_mask = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(should_mask) + dataset=dataset_id_permanent, + name=get_function_name(should_mask), + cloud_function_service_account="default", )(should_mask) s = bigframes.series.Series(["Alice", "Bob", "Caroline"]) @@ -1374,7 +1418,9 @@ def is_odd(x: int) -> bool: # create a remote function is_odd_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(is_odd) + dataset=dataset_id_permanent, + name=get_function_name(is_odd), + cloud_function_service_account="default", )(is_odd) # with nulls in the series the remote function application would fail @@ -1424,7 +1470,9 @@ def add(x: int, y: int) -> int: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(add) + dataset=dataset_id_permanent, + name=get_function_name(add), + cloud_function_service_account="default", )(add) # with nulls in the series the remote function application would fail @@ -1477,7 +1525,9 @@ def add(x: int, y: int, z: float) -> float: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(add) + dataset=dataset_id_permanent, + name=get_function_name(add), + cloud_function_service_account="default", )(add) # pandas does not support nary functions, so let's create a proxy function @@ -1533,6 +1583,7 @@ def is_long_duration(minutes: int) -> bool: is_long_duration = unordered_session.remote_function( dataset=dataset_id_permanent, name=get_function_name(is_long_duration), + 
cloud_function_service_account="default", )(is_long_duration) method = getattr(df["duration_minutes"], method) @@ -1551,7 +1602,9 @@ def combiner(x: int, y: int) -> int: return x combiner = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(combiner) + dataset=dataset_id_permanent, + name=get_function_name(combiner), + cloud_function_service_account="default", )(combiner) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] @@ -1567,7 +1620,9 @@ def processor(x: int, y: int, z: float, w: str) -> str: return f"I got x={x}, y={y}, z={z} and w={w}" processor = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(processor) + dataset=dataset_id_permanent, + name=get_function_name(processor), + cloud_function_service_account="default", )(processor) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index b075817b07..18f3ff2675 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -17,6 +17,7 @@ import bigframes_vendored.constants as constants import geopandas # type: ignore from geopandas.array import GeometryDtype # type:ignore +import geopandas.testing # type:ignore import google.api_core.exceptions import pandas as pd import pytest @@ -188,16 +189,17 @@ def test_geo_boundary(): LineString([(0, 0), (1, 1), (0, 1)]), Point(0, 1), ], + index=pd.Index([0, 1, 2, 3, 4], dtype="Int64"), ) bf_result = bf_s.geo.boundary.to_pandas() pd_result = pd_s.boundary - pd.testing.assert_series_equal( + geopandas.testing.assert_geoseries_equal( bf_result, pd_result, check_series_type=False, - check_index=False, + check_index_type=False, ) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 1843da41d7..d56874719e 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -202,64 +202,6 @@ def xgboost_iris_df(session, xgboost_iris_pandas_df): return session.read_pandas(xgboost_iris_pandas_df) -@pytest.fixture(scope="session") -def bqml_palm2_text_generator_model(session, bq_connection) -> core.BqmlModel: - options = { - "remote_service_type": "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1", - } - return globals.bqml_model_factory().create_remote_model( - session=session, connection_name=bq_connection, options=options - ) - - -@pytest.fixture(scope="session") -def palm2_text_generator_model(session, bq_connection) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) - - -@pytest.fixture(scope="session") -def palm2_text_generator_32k_model(session, bq_connection) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator( - model_name="text-bison-32k", session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="function") -def ephemera_palm2_text_generator_model( - session, bq_connection -) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_model( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return llm.PaLM2TextEmbeddingGenerator( - session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_model_002( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return 
llm.PaLM2TextEmbeddingGenerator( - version="002", session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_multilingual_model( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return llm.PaLM2TextEmbeddingGenerator( - model_name="textembedding-gecko-multilingual", - session=session, - connection_name=bq_connection, - ) - - @pytest.fixture(scope="session") def linear_remote_model_params() -> dict: # Pre-deployed endpoint of linear reg model in Vertex. diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 1827858353..3c5ba9bb18 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -390,27 +390,6 @@ def test_remote_model_predict( ) -@pytest.mark.flaky(retries=2) -def test_model_generate_text( - bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df -): - options = { - "temperature": 0.5, - "max_output_tokens": 100, - "top_k": 20, - "top_p": 0.5, - "flatten_json_output": True, - } - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - df = bqml_palm2_text_generator_model.generate_text( - llm_text_df, options=options - ).to_pandas(allow_large_results=True) - - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - @pytest.mark.parametrize("id_col_name", [None, "id"]) def test_model_forecast( time_series_bqml_arima_plus_model: core.BqmlModel, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 7e7a532f79..544889bf5a 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -24,187 +24,6 @@ from tests.system import utils -# Until b/401630655 is resolved, ML apis return json, not compatible with allow_large_results=False -@pytest.fixture(scope="module", autouse=True) -def always_create_table(): - with bigframes.option_context("bigquery.allow_large_results", True): - yield - - -def test_create_load_text_generator_model( - palm2_text_generator_model, dataset_id, bq_connection -): - # Model creation doesn't return error - assert palm2_text_generator_model is not None - assert palm2_text_generator_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = palm2_text_generator_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.model_name == "text-bison" - assert reloaded_model.connection_name == bq_connection - - -def test_create_load_text_generator_32k_model( - palm2_text_generator_32k_model, dataset_id, bq_connection -): - # Model creation doesn't return error - assert palm2_text_generator_32k_model is not None - assert palm2_text_generator_32k_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = palm2_text_generator_32k_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.model_name == "text-bison-32k" - assert reloaded_model.connection_name == bq_connection - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_model_default_session( - bq_connection, llm_text_pandas_df, bigquery_client -): - import bigframes.pandas as bpd - - # Note: This starts a thread-local session. 
- with bpd.option_context( - "bigquery.bq_connection", - bq_connection, - "bigquery.location", - "US", - ): - model = llm.PaLM2TextGenerator() - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_32k_model_default_session( - bq_connection, llm_text_pandas_df, bigquery_client -): - import bigframes.pandas as bpd - - # Note: This starts a thread-local session. - with bpd.option_context( - "bigquery.bq_connection", - bq_connection, - "bigquery.location", - "US", - ): - model = llm.PaLM2TextGenerator(model_name="text-bison-32k") - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_model_default_connection( - llm_text_pandas_df, bigquery_client -): - from bigframes import _config - import bigframes.pandas as bpd - - bpd.close_session() - _config.options = _config.Options() # reset configs - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - model = llm.PaLM2TextGenerator() - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-default-connection" - ) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -# Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
-@pytest.mark.flaky(retries=2) -def test_text_generator_predict_default_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_series_default_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict(llm_text_df["prompt"]).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_arbitrary_col_label_success( - palm2_text_generator_model, llm_text_df -): - llm_text_df = llm_text_df.rename(columns={"prompt": "arbitrary"}) - df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_multiple_cols_success( - palm2_text_generator_model, llm_text_df: bpd.DataFrame -): - df = llm_text_df.assign(additional_col=1) - pd_df = palm2_text_generator_model.predict(df).to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=utils.ML_GENERATE_TEXT_OUTPUT + ["additional_col"], - index=3, - col_exact=False, - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_with_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict( - llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 - ).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - @pytest.mark.parametrize( "model_name", ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), @@ -287,25 +106,6 @@ def test_create_load_multimodal_embedding_generator_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2) -def test_multimodal_embedding_generator_predict_default_params_success( - images_mm_df, session, bq_connection -): - bigframes.options.experiments.blob = True - - text_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - df = text_embedding_model.predict(images_mm_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, - columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, - index=2, - col_exact=False, - ) - assert len(df["ml_generate_embedding_result"][0]) == 1408 - - @pytest.mark.parametrize( "model_name", ( @@ -316,6 +116,8 @@ def test_multimodal_embedding_generator_predict_default_params_success( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky( @@ -420,36 +222,6 @@ def test_gemini_text_generator_multi_cols_predict_success( ) -@pytest.mark.parametrize( - "model_name", - ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", - "gemini-2.0-flash-exp", - ), -) -@pytest.mark.flaky(retries=2) -def test_gemini_text_generator_multimodal_input( - images_mm_df: bpd.DataFrame, model_name, session, bq_connection -): - bigframes.options.experiments.blob = True - - gemini_text_generator_model = llm.GeminiTextGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - 
pd_df = gemini_text_generator_model.predict( - images_mm_df, prompt=["Describe", images_mm_df["blob_col"]] - ).to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=utils.ML_GENERATE_TEXT_OUTPUT + ["blob_col"], - index=2, - col_exact=False, - ) - - # Overrides __eq__ function for comparing as mock.call parameter class EqCmpAllDataFrame(bpd.DataFrame): def __eq__(self, other): @@ -922,50 +694,6 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): ) -@pytest.mark.flaky(retries=2) -def test_llm_palm_score(llm_fine_tune_df_default_index): - model = llm.PaLM2TextGenerator(model_name="text-bison") - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index[["prompt"]], - y=llm_fine_tune_df_default_index[["label"]], - ).to_pandas() - utils.check_pandas_df_schema_and_index( - score_result, - columns=[ - "bleu4_score", - "rouge-l_precision", - "rouge-l_recall", - "rouge-l_f1_score", - "evaluation_status", - ], - index=1, - ) - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score_params(llm_fine_tune_df_default_index): - model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index["prompt"], - y=llm_fine_tune_df_default_index["label"], - task_type="classification", - ).to_pandas() - utils.check_pandas_df_schema_and_index( - score_result, - columns=[ - "precision", - "recall", - "f1_score", - "label", - "evaluation_status", - ], - ) - - @pytest.mark.flaky(retries=2) @pytest.mark.parametrize( "model_name", @@ -1023,41 +751,6 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) ) -def test_palm2_text_generator_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - llm.PaLM2TextGenerator() - - -def test_palm2_text_embedding_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - try: - llm.PaLM2TextEmbeddingGenerator() - except (Exception): - pass - - -@pytest.mark.parametrize( - "model_name", - ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", - ), -) -def test_gemini_text_generator_deprecated(model_name): - with pytest.warns(exceptions.ApiDeprecationWarning): - llm.GeminiTextGenerator(model_name=model_name) - - -def test_gemini_pro_text_generator_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - try: - llm.GeminiTextGenerator(model_name="gemini-pro") - except (Exception): - pass - - @pytest.mark.parametrize( "model_name", ( @@ -1069,3 +762,19 @@ def test_gemini_pro_text_generator_deprecated(): def test_gemini_preview_model_warnings(model_name): with pytest.warns(exceptions.PreviewWarning): llm.GeminiTextGenerator(model_name=model_name) + + +@pytest.mark.parametrize( + "model_class", + [ + llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, + llm.GeminiTextGenerator, + llm.Claude3TextGenerator, + ], +) +def test_text_embedding_generator_no_default_model_warning(model_class): + message = "Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. Please supply an\nexplicit model to avoid this message." 
+ bigframes.options.experiments.blob = True + with pytest.warns(FutureWarning, match=message): + model_class(model_name=None) diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py new file mode 100644 index 0000000000..51e6bcb2d5 --- /dev/null +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -0,0 +1,69 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes +from bigframes.ml import llm +import bigframes.pandas as bpd +from tests.system import utils + + +@pytest.mark.flaky(retries=2) +def test_multimodal_embedding_generator_predict_default_params_success( + images_mm_df, test_session, bq_connection +): + bigframes.options.experiments.blob = True + + text_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=test_session + ) + df = text_embedding_model.predict(images_mm_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, + index=2, + col_exact=False, + ) + assert len(df["ml_generate_embedding_result"][0]) == 1408 + + +@pytest.mark.parametrize( + "model_name", + ( + "gemini-1.5-pro-001", + "gemini-1.5-pro-002", + "gemini-1.5-flash-001", + "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", + ), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_generator_multimodal_input( + images_mm_df: bpd.DataFrame, model_name, test_session, bq_connection +): + bigframes.options.experiments.blob = True + + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=test_session + ) + pd_df = gemini_text_generator_model.predict( + images_mm_df, prompt=["Describe", images_mm_df["blob_col"]] + ).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_TEXT_OUTPUT + ["blob_col"], + index=2, + col_exact=False, + ) diff --git a/tests/system/small/ml/test_register.py b/tests/system/small/ml/test_register.py index 6d8ff0a712..f21567da63 100644 --- a/tests/system/small/ml/test_register.py +++ b/tests/system/small/ml/test_register.py @@ -14,9 +14,7 @@ from typing import cast -import pytest - -from bigframes.ml import core, imported, linear_model, llm +from bigframes.ml import core, imported, linear_model def test_linear_reg_register( @@ -53,13 +51,6 @@ def test_linear_reg_register_with_params( ) -def test_palm2_text_generator_register( - ephemera_palm2_text_generator_model: llm.PaLM2TextGenerator, -): - with pytest.raises(AttributeError): - ephemera_palm2_text_generator_model.register() # type: ignore - - def test_imported_tensorflow_register( ephemera_imported_tensorflow_model: imported.TensorFlowModel, ): diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index ca83604dd5..0463124309 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -20,7 +20,7 @@ import pytest import 
bigframes.series -from tests.system.utils import assert_series_equal, skip_legacy_pandas +from tests.system.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] DATE_COLUMNS = [ @@ -34,8 +34,9 @@ ("col_name",), DATE_COLUMNS, ) -@skip_legacy_pandas def test_dt_day(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.day.to_pandas() @@ -51,8 +52,9 @@ def test_dt_day(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_date(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.date.to_pandas() @@ -68,8 +70,9 @@ def test_dt_date(scalars_dfs, col_name): ("col_name",), DATE_COLUMNS, ) -@skip_legacy_pandas def test_dt_dayofweek(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.dayofweek.to_pandas() @@ -82,8 +85,9 @@ def test_dt_dayofweek(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_hour(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.hour.to_pandas() @@ -99,8 +103,9 @@ def test_dt_hour(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_minute(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.minute.to_pandas() @@ -116,8 +121,9 @@ def test_dt_minute(scalars_dfs, col_name): ("col_name",), DATE_COLUMNS, ) -@skip_legacy_pandas def test_dt_month(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.month.to_pandas() @@ -133,8 +139,9 @@ def test_dt_month(scalars_dfs, col_name): ("col_name",), DATE_COLUMNS, ) -@skip_legacy_pandas def test_dt_quarter(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.quarter.to_pandas() @@ -150,8 +157,9 @@ def test_dt_quarter(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_second(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = 
bf_series.dt.second.to_pandas() @@ -167,8 +175,9 @@ def test_dt_second(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_time(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.time.to_pandas() @@ -184,8 +193,9 @@ def test_dt_time(scalars_dfs, col_name): ("col_name",), DATE_COLUMNS, ) -@skip_legacy_pandas def test_dt_year(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.year.to_pandas() @@ -201,8 +211,9 @@ def test_dt_year(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_tz(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.tz @@ -215,8 +226,9 @@ def test_dt_tz(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_unit(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.unit @@ -234,8 +246,9 @@ def test_dt_unit(scalars_dfs, col_name): ("datetime_col", "%H:%M"), ], ) -@skip_legacy_pandas def test_dt_strftime(scalars_df_index, scalars_pandas_df_index, column, date_format): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_index[column].dt.strftime(date_format).to_pandas() pd_result = scalars_pandas_df_index[column].dt.strftime(date_format) pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) @@ -276,8 +289,9 @@ def test_dt_strftime_time(): ("col_name",), DATETIME_COL_NAMES, ) -@skip_legacy_pandas def test_dt_normalize(scalars_dfs, col_name): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name].dt.normalize().to_pandas() pd_result = scalars_pandas_df[col_name].dt.normalize() @@ -297,8 +311,9 @@ def test_dt_normalize(scalars_dfs, col_name): ("datetime_col", "us"), ], ) -@skip_legacy_pandas def test_dt_floor(scalars_dfs, col_name, freq): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name].dt.floor(freq).to_pandas() pd_result = scalars_pandas_df[col_name].dt.floor(freq) diff --git a/tests/system/small/test_bq_sessions.py b/tests/system/small/test_bq_sessions.py new file mode 100644 index 0000000000..7aad19bd8f --- /dev/null +++ b/tests/system/small/test_bq_sessions.py @@ -0,0 +1,85 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from concurrent.futures import ThreadPoolExecutor +import time + +import google +import google.api_core.exceptions +import google.cloud +from google.cloud import bigquery +import pytest + +from bigframes.session import bigquery_session + +TEST_SCHEMA = [ + bigquery.SchemaField("bool field", "BOOLEAN"), + bigquery.SchemaField("string field", "STRING"), + bigquery.SchemaField("float array_field", "FLOAT", mode="REPEATED"), + bigquery.SchemaField( + "struct field", + "RECORD", + fields=(bigquery.SchemaField("int subfield", "INTEGER"),), + ), +] + + +@pytest.fixture +def session_resource_manager( + bigquery_client, +) -> bigquery_session.SessionResourceManager: + return bigquery_session.SessionResourceManager(bigquery_client, "US") + + +def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client): + session_resource_manager = bigquery_session.SessionResourceManager( + bigquery_client, "US" + ) + cluster_cols = ["string field", "bool field"] + + session_table_ref = session_resource_manager.create_temp_table( + TEST_SCHEMA, cluster_cols=cluster_cols + ) + session_resource_manager._keep_session_alive() + + result_table = bigquery_client.get_table(session_table_ref) + assert result_table.schema == TEST_SCHEMA + assert result_table.clustering_fields == cluster_cols + + session_resource_manager.close() + with pytest.raises(google.api_core.exceptions.NotFound): + # It may take time for the underlying tables to get cleaned up after + # closing the session, so wait at least 1 minute to check. 
+ for _ in range(6): + bigquery_client.get_table(session_table_ref) + time.sleep(10) + + +def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client): + session_resource_manager = bigquery_session.SessionResourceManager( + bigquery_client, "US" + ) + + def create_table(): + return session_resource_manager.create_temp_table(TEST_SCHEMA) + + with ThreadPoolExecutor() as executor: + results = [executor.submit(create_table) for i in range(10)] + + for future in results: + table = future.result() + result_table = bigquery_client.get_table(table) + assert result_table.schema == TEST_SCHEMA + + session_resource_manager.close() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8cc3be1577..e77319b551 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -38,7 +38,6 @@ assert_pandas_df_equal, assert_series_equal, assert_series_equivalent, - skip_legacy_pandas, ) @@ -78,6 +77,23 @@ def test_df_construct_pandas_default(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("write_engine"), + [ + ("bigquery_inline"), + ("bigquery_load"), + ("bigquery_streaming"), + ], +) +def test_read_pandas_all_nice_types( + session: bigframes.Session, scalars_pandas_df_index: pd.DataFrame, write_engine +): + bf_result = session.read_pandas( + scalars_pandas_df_index, write_engine=write_engine + ).to_pandas() + pandas.testing.assert_frame_equal(bf_result, scalars_pandas_df_index) + + def test_df_construct_large_strings(): data = [["hello", "w" + "o" * 50000 + "rld"]] bf_result = dataframe.DataFrame(data).to_pandas() @@ -614,8 +630,9 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd.testing.assert_frame_equal(pd_result, bf_result) -@skip_legacy_pandas def test_drop_bigframes_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() scalars_pandas_df = scalars_pandas_df.copy() @@ -1146,7 +1163,6 @@ def test_assign_callable_lambda(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) -@skip_legacy_pandas @pytest.mark.parametrize( ("axis", "how", "ignore_index", "subset"), [ @@ -1160,6 +1176,8 @@ def test_assign_callable_lambda(scalars_dfs): ], ) def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() @@ -1172,8 +1190,9 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): pandas.testing.assert_frame_equal(bf_result, pd_result) -@skip_legacy_pandas def test_df_dropna_range_columns(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() scalars_pandas_df = scalars_pandas_df.copy() @@ -1372,11 +1391,12 @@ def test_df_iter( assert bf_i == df_i -@skip_legacy_pandas def test_iterrows( scalars_df_index, scalars_pandas_df_index, ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) scalars_pandas_df_index = 
scalars_pandas_df_index.add_suffix("_suffix", axis=1) for (bf_index, bf_series), (pd_index, pd_series) in zip( @@ -1743,6 +1763,29 @@ def test_len(scalars_dfs): assert bf_result == pd_result +@pytest.mark.parametrize( + ("n_rows",), + [ + (50,), + (10000,), + ], +) +@pytest.mark.parametrize( + "write_engine", + ["bigquery_load", "bigquery_streaming"], +) +def test_df_len_local(session, n_rows, write_engine): + assert ( + len( + session.read_pandas( + pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), + write_engine=write_engine, + ) + ) + == n_rows + ) + + def test_size(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df.size @@ -2356,8 +2399,9 @@ def test_df_corrwith_df_non_numeric_error(scalars_dfs): scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) -@skip_legacy_pandas def test_df_corrwith_series(scalars_dfs_maybe_ordered): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered l_cols = ["int64_col", "float64_col", "int64_too"] @@ -2486,7 +2530,6 @@ def test_series_binop_axis_index( assert_pandas_df_equal(bf_result, pd_result) -@skip_legacy_pandas @pytest.mark.parametrize( ("input"), [ @@ -2501,6 +2544,8 @@ def test_series_binop_axis_index( ], ) def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs df_columns = ["int64_col", "float64_col", "int64_too"] @@ -2513,8 +2558,9 @@ def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) -@skip_legacy_pandas def test_df_reverse_binop_pandas(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs pd_series = pd.Series([100, 200, 300]) @@ -2957,8 +3003,9 @@ def test_dataframe_agg_int_multi_string(scalars_dfs): ) -@skip_legacy_pandas def test_df_describe_non_temporal(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs # excluding temporal columns here because BigFrames cannot perform percentiles operations on them unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] @@ -2991,9 +3038,10 @@ def test_df_describe_non_temporal(scalars_dfs): ).all() -@skip_legacy_pandas @pytest.mark.parametrize("include", [None, "all"]) def test_df_describe_non_numeric(scalars_dfs, include): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is @@ -3021,8 +3069,9 @@ def test_df_describe_non_numeric(scalars_dfs, include): ) -@skip_legacy_pandas def test_df_describe_temporal(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] @@ -3048,8 +3097,9 @@ def test_df_describe_temporal(scalars_dfs): ) -@skip_legacy_pandas def test_df_describe_mixed_types_include_all(scalars_dfs): + # TODO: 
supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs numeric_columns = [ @@ -4607,13 +4657,12 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ], ) def test_df_drop_duplicates_w_json(json_df, keep): - bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True) + bf_df = json_df.drop_duplicates(keep=keep).to_pandas() # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible # with Arrow string extension types. Temporary conversion to standard Pandas # strings is required. - # allow_large_results=True for b/401630655 - json_pandas_df = json_df.to_pandas(allow_large_results=True) + json_pandas_df = json_df.to_pandas() json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( pd.StringDtype(storage="pyarrow") ) @@ -4731,8 +4780,9 @@ def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result -@skip_legacy_pandas def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: scalars_df_index.to_json(bf_result_file, orient="table") # default_handler for arrow types that have no default conversion @@ -4858,7 +4908,6 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result -@skip_legacy_pandas @pytest.mark.parametrize( ("expr",), [ @@ -4868,6 +4917,8 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): ], ) def test_df_eval(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df.eval(expr).to_pandas() @@ -4876,7 +4927,6 @@ def test_df_eval(scalars_dfs, expr): pd.testing.assert_frame_equal(bf_result, pd_result) -@skip_legacy_pandas @pytest.mark.parametrize( ("expr",), [ @@ -4886,6 +4936,8 @@ def test_df_eval(scalars_dfs, expr): ], ) def test_df_query(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") # local_var is referenced in expressions local_var = 3 # NOQA scalars_df, scalars_pandas_df = scalars_dfs @@ -5204,9 +5256,7 @@ def test_query_complexity_repeated_subtrees( # See: https://github.com/python/cpython/issues/112282 reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", ) -def test_query_complexity_repeated_analytic( - scalars_df_index, scalars_pandas_df_index, with_multiquery_execution -): +def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index): bf_df = scalars_df_index[["int64_col", "int64_too"]] pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] # Uses LAG analytic operator, each in a new SELECT @@ -5218,22 +5268,6 @@ def test_query_complexity_repeated_analytic( assert_pandas_df_equal(bf_result, pd_result) -def test_to_pandas_downsampling_option_override(session): - df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") - download_size = 1 - - # limits only apply for allow_large_result=True - df = df.to_pandas( - max_download_size=download_size, - sampling_method="head", - allow_large_results=True, - ) - - total_memory_bytes = df.memory_usage(deep=True).sum() - total_memory_mb = 
total_memory_bytes / (1024 * 1024) - assert total_memory_mb == pytest.approx(download_size, rel=0.5) - - def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): dataset_id = dataset_id_not_created destination_table = f"{dataset_id}.scalars_df" @@ -5342,7 +5376,6 @@ def test_dataframe_explode_xfail(col_names): df.explode(col_names) -@skip_legacy_pandas @pytest.mark.parametrize( ("on", "rule", "origin"), [ @@ -5362,6 +5395,8 @@ def test_dataframe_explode_xfail(col_names): def test__resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = ( scalars_df_index._resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] @@ -5377,7 +5412,6 @@ def test__resample_with_column( ) -@skip_legacy_pandas @pytest.mark.parametrize( ("append", "level", "col", "rule"), [ @@ -5389,6 +5423,8 @@ def test__resample_with_column( def test__resample_with_index( scalars_df_index, scalars_pandas_df_index, append, level, col, rule ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append) scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) bf_result = ( @@ -5405,7 +5441,6 @@ def test__resample_with_index( assert_pandas_df_equal(bf_result, pd_result) -@skip_legacy_pandas @pytest.mark.parametrize( ("rule", "origin", "data"), [ @@ -5445,6 +5480,8 @@ def test__resample_with_index( ], ) def test__resample_start_time(rule, origin, data): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" scalars_df_index = bpd.DataFrame(data).set_index(col) scalars_pandas_df_index = pd.DataFrame(data).set_index(col) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index cd21f5094c..a69c26bc54 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -35,6 +35,7 @@ import bigframes import bigframes.dataframe +import bigframes.enums import bigframes.features import bigframes.pandas as bpd @@ -257,7 +258,7 @@ def test_to_pandas_override_global_option(scalars_df_index): scalars_df_index.to_pandas() table_id = scalars_df_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + assert table_id is not None # When allow_large_results=False, a query_job object should not be created. # Therefore, the table_id should remain unchanged. 
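The dataframe_io hunk below pins down the to_pandas_batches() paging contract (regression coverage for b/407521010); in sketch form, reusing the sizes from the test:

    # Each batch arrives as a plain pandas DataFrame of up to page_size rows; here
    # max_results is a multiple of page_size, so every batch has exactly 42 rows.
    for chunk in bf_df.to_pandas_batches(page_size=42, max_results=42 * 3):
        assert chunk.shape[0] == 42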
@@ -265,14 +266,69 @@ assert scalars_df_index._query_job.destination.table_id == table_id +def test_to_pandas_downsampling_option_override(session): + df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") + download_size = 1 + + with pytest.warns( + UserWarning, match="The data size .* exceeds the maximum download limit" + ): + # limits only apply for allow_large_results=True + df = df.to_pandas( + max_download_size=download_size, + sampling_method="head", + allow_large_results=True, + ) + + total_memory_bytes = df.memory_usage(deep=True).sum() + total_memory_mb = total_memory_bytes / (1024 * 1024) + assert total_memory_mb == pytest.approx(download_size, rel=0.5) + + +@pytest.mark.parametrize( + ("kwargs", "message"), + [ + pytest.param( + {"sampling_method": "head"}, + r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample", + id="sampling_method", + ), + pytest.param( + {"random_state": 10}, + r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample", + id="random_state", + ), + pytest.param( + {"max_download_size": 10}, + r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches", + id="max_download_size", + ), + ], +) +def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message): + with pytest.warns(FutureWarning, match=message): + scalars_df_index.to_pandas( + # limits only apply for allow_large_results=True + allow_large_results=True, + **kwargs, + ) + + +def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): + bf_df = session.read_pandas(scalars_pandas_df_multi_index) + + result = bf_df.to_pandas(dry_run=True) + + assert len(result) == 14 + + def test_to_arrow_override_global_option(scalars_df_index): # Direct call to_arrow uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. with bigframes.option_context("bigquery.allow_large_results", True): scalars_df_index.to_arrow() table_id = scalars_df_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + assert table_id is not None # When allow_large_results=False, a query_job object should not be created. # Therefore, the table_id should remain unchanged. @@ -288,6 +344,30 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): pd.testing.assert_series_equal(actual, expected) +@pytest.mark.parametrize("allow_large_results", (True, False)) +def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results): + """Verify to_pandas_batches() API returns the expected page size. + + Regression test for b/407521010.
+ """ + bf_df = session.read_gbq( + "bigquery-public-data.usa_names.usa_1910_2013", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + expected_column_count = len(bf_df.columns) + + batch_count = 0 + for pd_df in bf_df.to_pandas_batches( + page_size=42, allow_large_results=allow_large_results, max_results=42 * 3 + ): + batch_row_count, batch_column_count = pd_df.shape + batch_count += 1 + assert batch_column_count == expected_column_count + assert batch_row_count == 42 + + assert batch_count == 3 + + @pytest.mark.parametrize( ("index",), [(True,), (False,)], @@ -789,11 +869,3 @@ def test_to_sql_query_named_index_excluded( utils.assert_pandas_df_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True ) - - -def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index): - bf_df = session.read_pandas(scalars_pandas_df_multi_index) - - result = bf_df.to_pandas(dry_run=True) - - assert len(result) == 14 diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 8ce53c218b..97f44694b0 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -84,36 +84,6 @@ def test_session_query_job(bq_cmek, session_with_bq_cmek): assert table.encryption_configuration.kms_key_name == bq_cmek -def test_session_load_job(bq_cmek, session_with_bq_cmek): - if not bq_cmek: # pragma: NO COVER - pytest.skip("no cmek set for testing") # pragma: NO COVER - - # Session should have cmek set in the default query and load job configs - load_table = session_with_bq_cmek._temp_storage_manager.allocate_temp_table() - - df = pandas.DataFrame({"col0": [1, 2, 3]}) - load_job_config = bigquery.LoadJobConfig() - load_job_config.schema = [ - bigquery.SchemaField(df.columns[0], bigquery.enums.SqlTypeNames.INT64) - ] - - load_job = session_with_bq_cmek.bqclient.load_table_from_dataframe( - df, - load_table, - job_config=load_job_config, - ) - load_job.result() - - assert load_job.destination == load_table - assert load_job.destination_encryption_configuration.kms_key_name.startswith( - bq_cmek - ) - - # The load destination table should be created with the intended encryption - table = session_with_bq_cmek.bqclient.get_table(load_job.destination) - assert table.encryption_configuration.kms_key_name == bq_cmek - - def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): if not bq_cmek: # pragma: NO COVER pytest.skip("no cmek set for testing") # pragma: NO COVER @@ -194,7 +164,7 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): # Write the result to BQ custom table and assert encryption session_with_bq_cmek.bqclient.get_table(output_table_id) - output_table_ref = session_with_bq_cmek._temp_storage_manager.allocate_temp_table() + output_table_ref = session_with_bq_cmek._anon_dataset_manager.allocate_temp_table() output_table_id = str(output_table_ref) df.to_gbq(output_table_id) output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) @@ -232,7 +202,7 @@ def test_read_pandas_large(bq_cmek, session_with_bq_cmek): _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) -def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): +def test_kms_encryption_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): if not bq_cmek: # pragma: NO COVER pytest.skip("no cmek set for testing") # pragma: NO COVER diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 2e5cd18158..f1d2bacf08 100644 --- a/tests/system/small/test_groupby.py +++ 
b/tests/system/small/test_groupby.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +from tests.system.utils import assert_pandas_df_equal # ================= # DataFrame.groupby @@ -94,7 +94,6 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q ) -@skip_legacy_pandas @pytest.mark.parametrize( ("na_option", "method", "ascending"), [ @@ -132,6 +131,8 @@ def test_dataframe_groupby_rank( method, ascending, ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") col_names = ["int64_too", "float64_col", "int64_col", "string_col"] bf_result = ( scalars_df_index[col_names] @@ -599,7 +600,6 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) -@skip_legacy_pandas @pytest.mark.parametrize( ("na_option", "method", "ascending"), [ @@ -637,6 +637,8 @@ def test_series_groupby_rank( method, ascending, ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") col_names = ["int64_col", "string_col"] bf_result = ( scalars_df_index[col_names] diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 535e4bc9ae..9f45c8465b 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session): ) index = session.read_pandas(df).set_index(["A", "B"]).index assert "names=['A', 'B']" in repr(index) - - -def test_to_pandas_dry_run(scalars_df_index): - index = scalars_df_index.index - - result = index.to_pandas(dry_run=True) - - assert len(result) == 14 diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py index 85001e4ec5..fcb3fa3920 100644 --- a/tests/system/small/test_index_io.py +++ b/tests/system/small/test_index_io.py @@ -20,10 +20,9 @@ def test_to_pandas_override_global_option(scalars_df_index): bf_index = scalars_df_index.index # Direct call to_pandas uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. bf_index.to_pandas() table_id = bf_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + assert table_id is not None # When allow_large_results=False, a query_job object should not be created. # Therefore, the table_id should remain unchanged. @@ -31,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index): assert bf_index._query_job.destination.table_id == table_id +def test_to_pandas_dry_run(scalars_df_index): + index = scalars_df_index.index + + result = index.to_pandas(dry_run=True) + + assert len(result) == 14 + + def test_to_numpy_override_global_option(scalars_df_index): with bigframes.option_context("bigquery.allow_large_results", True): @@ -40,7 +47,7 @@ def test_to_numpy_override_global_option(scalars_df_index): # table has 'bqdf' prefix. bf_index.to_numpy() table_id = bf_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + assert table_id is not None # When allow_large_results=False, a query_job object should not be created. # Therefore, the table_id should remain unchanged. 
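Note: the recurring replacement of the shared skip_legacy_pandas decorator with an inline guard behaves as in this minimal sketch (the test name and Series contents are illustrative):

    import pytest

    def test_needs_pandas_2x():
        # importorskip returns the module when the installed version meets
        # minversion and skips the test at runtime otherwise, which is why
        # it can stand in for the removed @skip_legacy_pandas decorator.
        pd = pytest.importorskip("pandas", minversion="2.0.0")
        s = pd.Series([1, 2, 3], dtype="int64[pyarrow]")  # pyarrow-backed dtype (needs pyarrow installed)
        assert int(s.sum()) == 6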
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1c78ac63d9..a01b7aab92 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +from tests.system.utils import assert_pandas_df_equal def test_multi_index_from_arrays(): @@ -45,8 +45,9 @@ def test_multi_index_from_arrays(): pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) -@skip_legacy_pandas def test_read_pandas_multi_index_axes(): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") index = pandas.MultiIndex.from_arrays( [ pandas.Index([4, 99], dtype=pandas.Int64Dtype()), @@ -759,8 +760,9 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index) pandas.testing.assert_series_equal(bf_result, pd_result) -@skip_legacy_pandas def test_column_multi_index_any(): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") columns = pandas.MultiIndex.from_tuples( [("col0", "col00"), ("col0", "col00"), ("col1", "col11")] ) diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 6da4c6ff9c..cf41daeb51 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -18,7 +18,6 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import skip_legacy_pandas def test_null_index_to_gbq(session, scalars_df_null_index, dataset_id_not_created): @@ -126,8 +125,9 @@ def test_null_index_groupby_aggregate( pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) -@skip_legacy_pandas def test_null_index_analytic(scalars_df_null_index, scalars_pandas_df_default_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_null_index["int64_col"].cumsum().to_pandas() pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() pd.testing.assert_series_equal( @@ -173,7 +173,6 @@ def test_null_index_merge_left_null_index_object( assert got.shape == expected.shape -@skip_legacy_pandas @pytest.mark.parametrize( ("expr",), [ @@ -185,6 +184,8 @@ def test_null_index_merge_left_null_index_object( def test_null_index_df_eval( scalars_df_null_index, scalars_pandas_df_default_index, expr ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_null_index.eval(expr).to_pandas() pd_result = scalars_pandas_df_default_index.eval(expr) @@ -237,8 +238,9 @@ def test_null_index_merge_two_null_index_objects( assert got.shape == expected.shape -@skip_legacy_pandas def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") stacking_cols = ["int64_col", "int64_too"] bf_result = scalars_df_null_index[stacking_cols].stack().to_pandas() pd_result = ( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 2b6dfefb12..491b56d5fc 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,6 +16,7 @@ import typing import pandas as pd +import pyarrow as pa import pytest import pytz @@ -39,6 +40,16 @@ def 
test_concat_dataframe(scalars_dfs, ordered): assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) +def test_concat_dataframe_w_struct_cols(nested_structs_df, nested_structs_pandas_df): + """Avoid regressions for internal issue 407107482""" + empty_bf_df = bpd.DataFrame(session=nested_structs_df._block.session) + bf_result = bpd.concat((empty_bf_df, nested_structs_df), ignore_index=True) + bf_result = bf_result.to_pandas() + pd_result = pd.concat((pd.DataFrame(), nested_structs_pandas_df), ignore_index=True) + pd_result.index = pd_result.index.astype("Int64") + pd.testing.assert_frame_equal(bf_result, pd_result) + + def test_concat_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat( @@ -388,150 +399,229 @@ def test_merge_series(scalars_dfs, merge_how): def _convert_pandas_category(pd_s: pd.Series): + """ + Transforms a pandas Series produced by pd.cut into a bigframes-compatible + Series (integer codes, string labels, or interval structs). + """ + # When `labels=False`, pd.cut returns numeric codes, so cast them to Int64. + if pd.api.types.is_integer_dtype(pd_s.dtype) or pd.api.types.is_float_dtype( + pd_s.dtype + ): + return pd_s.astype("Int64") + if not isinstance(pd_s.dtype, pd.CategoricalDtype): - raise ValueError("Input must be a pandas Series with categorical data.") + raise ValueError( + f"Input must be a pandas Series with categorical data: {pd_s.dtype}" + ) - if len(pd_s.dtype.categories) == 0: - return pd.Series([pd.NA] * len(pd_s), name=pd_s.name) + if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype): + return pd_s.astype(pd.StringDtype(storage="pyarrow")) - pd_interval: pd.IntervalIndex = pd_s.cat.categories[pd_s.cat.codes] # type: ignore - if pd_interval.closed == "left": + if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype): + raise ValueError( + f"Must be an IntervalDtype with categorical data: {pd_s.cat.categories.dtype}" + ) + + if pd_s.cat.categories.dtype.closed == "left": # type: ignore left_key = "left_inclusive" right_key = "right_exclusive" else: left_key = "left_exclusive" right_key = "right_inclusive" - return pd.Series( - [ - {left_key: interval.left, right_key: interval.right} + + subtype = pd_s.cat.categories.dtype.subtype # type: ignore + if pd.api.types.is_float_dtype(subtype): + interval_dtype = pa.float64() + elif pd.api.types.is_integer_dtype(subtype): + interval_dtype = pa.int64() + else: + raise ValueError(f"Unknown category type: {subtype}") + + dtype = pd.ArrowDtype( + pa.struct( + [ + pa.field(left_key, interval_dtype, nullable=True), + pa.field(right_key, interval_dtype, nullable=True), + ] + ) + ) + + if len(pd_s.dtype.categories) == 0: + data = [pd.NA] * len(pd_s) + else: + data = [ + {left_key: interval.left, right_key: interval.right} # type: ignore if pd.notna(val) else pd.NA - for val, interval in zip(pd_s, pd_interval) - ], + for val, interval in zip(pd_s, pd_s.cat.categories[pd_s.cat.codes]) # type: ignore + ] + + return pd.Series( + data=data, name=pd_s.name, + dtype=dtype, + index=pd_s.index.astype("Int64"), ) @pytest.mark.parametrize( - ("right"), + ("right", "labels"), [ - pytest.param(True), - pytest.param(False), + pytest.param(True, None, id="right_w_none_labels"), + pytest.param(True, False, id="right_w_false_labels"), + pytest.param(False, None, id="left_w_none_labels"), + pytest.param(False, False, id="left_w_false_labels"), ], ) -def test_cut(scalars_dfs, right): +def test_cut_by_int_bins(scalars_dfs, labels, right): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False, 
right=right) - bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False, right=right) + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels, right=right) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels, right=right) - # make sure the result is a supported dtype - assert bf_result.dtype == bpd.Int64Dtype() - pd_result = pd_result.astype("Int64") + pd_result = _convert_pandas_category(pd_result) pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) -@pytest.mark.parametrize( - ("right"), - [ - pytest.param(True), - pytest.param(False), - ], -) -def test_cut_default_labels(scalars_dfs, right): +def test_cut_by_int_bins_w_labels(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, right=right) - bf_result = bpd.cut(scalars_df["float64_col"], 5, right=right).to_pandas() + labels = ["A", "B", "C", "D", "E"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels) - # Convert to match data format - pd_result_converted = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal( - bf_result, pd_result_converted, check_index=False, check_dtype=False - ) + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( - ("breaks", "right"), + ("breaks", "right", "labels"), [ - pytest.param([0, 5, 10, 15, 20, 100, 1000], True, id="int_right"), - pytest.param([0, 5, 10, 15, 20, 100, 1000], False, id="int_left"), - pytest.param([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], False, id="float_left"), - pytest.param([0, 5, 10.5, 15.5, 20, 100, 1000.5], True, id="mixed_right"), + pytest.param( + [0, 5, 10, 15, 20, 100, 1000], + True, + None, + id="int_breaks_w_right_closed_and_none_labels", + ), + pytest.param( + [0, 5, 10, 15, 20, 100, 1000], + False, + False, + id="int_breaks_w_left_closed_and_false_labels", + ), + pytest.param( + [0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], + False, + None, + id="float_breaks_w_left_closed_and_none_labels", + ), + pytest.param( + [0, 5, 10.5, 15.5, 20, 100, 1000.5], + True, + False, + id="mixed_types_breaks_w_right_closed_and_false_labels", + ), ], ) -def test_cut_numeric_breaks(scalars_dfs, breaks, right): +def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks, right=right) - bf_result = bpd.cut(scalars_df["float64_col"], breaks, right=right).to_pandas() + pd_result = pd.cut( + scalars_pandas_df["float64_col"], breaks, right=right, labels=labels + ) + bf_result = bpd.cut( + scalars_df["float64_col"], breaks, right=right, labels=labels + ).to_pandas() - # Convert to match data format pd_result_converted = _convert_pandas_category(pd_result) - - pd.testing.assert_series_equal( - bf_result, pd_result_converted, check_index=False, check_dtype=False - ) + pd.testing.assert_series_equal(bf_result, pd_result_converted) -@pytest.mark.parametrize( - "bins", - [ - pytest.param([], id="empty_list"), - pytest.param( - [1], id="single_int_list", marks=pytest.mark.skip(reason="b/404338651") - ), - pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"), - ], -) -def test_cut_w_edge_cases(scalars_dfs, bins): +def test_cut_by_numeric_breaks_w_labels(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = bpd.cut(scalars_df["int64_too"], bins, 
labels=False).to_pandas() - if isinstance(bins, list): - bins = pd.IntervalIndex.from_tuples(bins) - pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) - # Convert to match data format - pd_result_converted = _convert_pandas_category(pd_result) + bins = [0, 5, 10, 15, 20] + labels = ["A", "B", "C", "D"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) - pd.testing.assert_series_equal( - bf_result, pd_result_converted, check_index=False, check_dtype=False - ) + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( - ("bins", "right"), + ("bins", "right", "labels"), [ - pytest.param([(-5, 2), (2, 3), (-3000, -10)], True, id="tuple_right"), - pytest.param([(-5, 2), (2, 3), (-3000, -10)], False, id="tuple_left"), + pytest.param( + [(-5, 2), (2, 3), (-3000, -10)], True, None, id="tuple_right_w_none_labels" + ), + pytest.param( + [(-5, 2), (2, 3), (-3000, -10)], + False, + False, + id="tuple_left_w_false_labels", + ), pytest.param( pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]), True, - id="interval_right", + False, + id="interval_right_w_false_labels", ), pytest.param( pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]), False, - id="interval_left", + None, + id="interval_left_w_none_labels", ), ], ) -def test_cut_with_interval(scalars_dfs, bins, right): +def test_cut_by_interval_bins(scalars_dfs, bins, right, labels): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.cut( - scalars_df["int64_too"], bins, labels=False, right=right + scalars_df["int64_too"], bins, labels=labels, right=right ).to_pandas() if isinstance(bins, list): bins = pd.IntervalIndex.from_tuples(bins) - pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False, right=right) + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels, right=right) - # Convert to match data format pd_result_converted = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result, pd_result_converted) - pd.testing.assert_series_equal( - bf_result, pd_result_converted, check_index=False, check_dtype=False - ) + + +def test_cut_by_interval_bins_w_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]) + labels = ["A", "B", "C", "D", "E"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) + + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("bins", "labels"), + [ + pytest.param([], None, id="empty_breaks"), + pytest.param([1], False, id="single_int_breaks"), + pytest.param(pd.IntervalIndex.from_tuples([]), None, id="empty_interval_index"), + ], +) +def test_cut_by_edge_cases_bins(scalars_dfs, bins, labels): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=labels).to_pandas() + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels) + + pd_result_converted = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result, pd_result_converted) + + +def test_cut_empty_array_raises_error(): + bf_series = bpd.Series([]) + with pytest.raises(ValueError, match="Cannot cut empty array"): + bpd.cut(bf_series, bins=5) @pytest.mark.parametrize( diff 
--git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 3139ae5225..9c61c8ea5b 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -55,6 +55,19 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, with bf.option_context("display.progress_bar", "terminal"): penguins_df_default_index["body_mass_g"].head(10).mean() + assert capsys.readouterr().out == "" + + +def test_progress_bar_scalar_allow_large_results( + penguins_df_default_index: bf.dataframe.DataFrame, capsys +): + capsys.readouterr() # clear output + + with bf.option_context( + "display.progress_bar", "terminal", "bigquery.allow_large_results", "True" + ): + penguins_df_default_index["body_mass_g"].head(10).mean() + assert_loading_msg_exist(capsys.readouterr().out) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5ca055dc43..c63bf8e12b 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,7 +24,7 @@ import pandas as pd import pyarrow as pa # type: ignore import pytest -import shapely # type: ignore +import shapely.geometry # type: ignore import bigframes.dtypes as dtypes import bigframes.features @@ -34,7 +34,6 @@ assert_pandas_df_equal, assert_series_equal, get_first_file_from_wildcard, - skip_legacy_pandas, ) @@ -229,7 +228,11 @@ def test_series_construct_from_list_escaped_strings(): def test_series_construct_geodata(): pd_series = pd.Series( - [shapely.Point(1, 1), shapely.Point(2, 2), shapely.Point(3, 3)], + [ + shapely.geometry.Point(1, 1), + shapely.geometry.Point(2, 2), + shapely.geometry.Point(3, 3), + ], dtype=gpd.array.GeometryDtype(), ) @@ -322,24 +325,22 @@ def test_series_construct_local_unordered_has_sequential_index(unordered_session def test_series_construct_w_dtype_for_json(): - # Until b/401630655 is resolved, json, not compatible with allow_large_results=False - with bigframes.option_context("bigquery.allow_large_results", True): - data = [ - "1", - '"str"', - "false", - '["a", {"b": 1}, null]', - None, - '{"a": {"b": [1, 2, 3], "c": true}}', - ] - s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) + data = [ + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', + None, + '{"a": {"b": [1, 2, 3], "c": true}}', + ] + s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) - assert s[0] == "1" - assert s[1] == '"str"' - assert s[2] == "false" - assert s[3] == '["a",{"b":1},null]' - assert pd.isna(s[4]) - assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert s[3] == '["a",{"b":1},null]' + assert pd.isna(s[4]) + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' def test_series_keys(scalars_dfs): @@ -402,8 +403,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - series_pandas = series.to_pandas(allow_large_results=True) + series_pandas = series.to_pandas() assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert series_pandas.shape[0] == json_pandas_df.shape[0] @@ -1033,8 +1033,9 @@ def test_series_corr(scalars_dfs): assert math.isclose(pd_result, bf_result) -@skip_legacy_pandas def test_series_autocorr(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, 
scalars_pandas_df = scalars_dfs bf_result = scalars_df["float64_col"].autocorr(2) pd_result = scalars_pandas_df["float64_col"].autocorr(2) @@ -1683,8 +1684,9 @@ def test_binop_right_filtered(scalars_dfs): (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), ], ) -@skip_legacy_pandas def test_series_binop_w_other_types(scalars_dfs, other): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() @@ -1704,8 +1706,9 @@ def test_series_binop_w_other_types(scalars_dfs, other): (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), ], ) -@skip_legacy_pandas def test_series_reverse_binop_w_other_types(scalars_dfs, other): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_result = (other + scalars_df["int64_col"].head(3)).to_pandas() @@ -1717,8 +1720,9 @@ def test_series_reverse_binop_w_other_types(scalars_dfs, other): ) -@skip_legacy_pandas def test_series_combine_first(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs int64_col = scalars_df["int64_col"].head(7) float64_col = scalars_df["float64_col"].tail(7) @@ -2361,8 +2365,9 @@ def test_series_peek_filtered(scalars_dfs): ) -@skip_legacy_pandas def test_series_peek_force(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs cumsum_df = scalars_df[["int64_col", "int64_too"]].cumsum() @@ -2376,8 +2381,9 @@ def test_series_peek_force(scalars_dfs): ) -@skip_legacy_pandas def test_series_peek_force_float(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs cumsum_df = scalars_df[["int64_col", "float64_col"]].cumsum() @@ -2594,8 +2600,9 @@ def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): ) -@skip_legacy_pandas def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") col_name = "float64_col" # set non-unique index to check implicit alignment bf_series = scalars_df_index.set_index("bool_col")[col_name].fillna(0.0) @@ -3186,8 +3193,9 @@ def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result -@skip_legacy_pandas def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: scalars_df_index.int64_col.to_json(bf_result_file) scalars_pandas_df_index.int64_col.to_json(pd_result_file) @@ -3478,8 +3486,9 @@ def foo(x): # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions ], ) -@skip_legacy_pandas def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = 
scalars_df_index[column].astype(to_type, errors=errors).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) pd.testing.assert_series_equal(bf_result, pd_result) @@ -3513,8 +3522,9 @@ def test_series_astype_error_error(session): session.read_pandas(input).astype("Float64", errors="bad_value") -@skip_legacy_pandas def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") column = "numeric_col" to_type = "Int64" bf_result = scalars_df_index[column].astype(to_type).to_pandas() @@ -3531,10 +3541,11 @@ def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): ("time_col", "int64[pyarrow]"), ], ) -@skip_legacy_pandas def test_date_time_astype_int( scalars_df_index, scalars_pandas_df_index, column, to_type ): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) @@ -4345,6 +4356,21 @@ def test_series_explode_w_aggregate(): assert s.explode().sum() == pd_s.explode().sum() +def test_series_construct_empty_array(): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + s = bigframes.pandas.Series([[]]) + expected = pd.Series( + [[]], + dtype=pd.ArrowDtype(pa.list_(pa.float64())), + index=pd.Index([0], dtype=pd.Int64Dtype()), + ) + pd.testing.assert_series_equal( + expected, + s.to_pandas(), + ) + + @pytest.mark.parametrize( ("data"), [ @@ -4363,7 +4389,6 @@ def test_series_explode_null(data): ) -@skip_legacy_pandas @pytest.mark.parametrize( ("append", "level", "col", "rule"), [ @@ -4374,6 +4399,8 @@ def test_series_explode_null(data): ], ) def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" @@ -4384,13 +4411,13 @@ def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col def test_series_struct_get_field_by_attribute( - nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type + nested_structs_df, nested_structs_pandas_df ): if Version(pd.__version__) < Version("2.2.0"): pytest.skip("struct accessor is not supported before pandas 2.2") bf_series = nested_structs_df["person"] - df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type) + df_series = nested_structs_pandas_df["person"] pd.testing.assert_series_equal( bf_series.address.city.to_pandas(), diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py index ae09a2cf5d..235ae65750 100644 --- a/tests/system/small/test_series_io.py +++ b/tests/system/small/test_series_io.py @@ -11,7 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
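Note: the Series.to_pandas_batches tests in the next file assert the page accounting sketched here in pure Python (a hypothetical helper, not the bigframes implementation):

    from typing import Iterator, List

    def paginate(rows: List[int], page_size: int, max_results: int) -> Iterator[List[int]]:
        # max_results caps the total row count first; page_size then slices
        # what remains, so every batch but possibly the last holds exactly
        # page_size rows.
        capped = rows[:max_results]
        for start in range(0, len(capped), page_size):
            yield capped[start : start + page_size]

    # Mirrors the b/407521010 regression test: 42 * 3 rows in pages of 42.
    assert len(list(paginate(list(range(10_000)), 42, 42 * 3))) == 3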
+import pandas as pd +import pytest + import bigframes +import bigframes.series def test_to_pandas_override_global_option(scalars_df_index): @@ -19,11 +23,10 @@ def test_to_pandas_override_global_option(scalars_df_index): bf_series = scalars_df_index["int64_col"] - # Direct call to_pandas uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. + # Direct call to_pandas uses global default setting (allow_large_results=True) bf_series.to_pandas() table_id = bf_series._query_job.destination.table_id - assert table_id.startswith("bqdf") + assert table_id is not None session = bf_series._block.session execution_count = session._metrics.execution_count @@ -33,3 +36,81 @@ def test_to_pandas_override_global_option(scalars_df_index): bf_series.to_pandas(allow_large_results=False) assert bf_series._query_job.destination.table_id == table_id assert session._metrics.execution_count - execution_count == 1 + + +@pytest.mark.parametrize( + ("kwargs", "message"), + [ + pytest.param( + {"sampling_method": "head"}, + r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample", + id="sampling_method", + ), + pytest.param( + {"random_state": 10}, + r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample", + id="random_state", + ), + pytest.param( + {"max_download_size": 10}, + r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches", + id="max_download_size", + ), + ], +) +def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message): + s: bigframes.series.Series = scalars_df_index["int64_col"] + with pytest.warns(FutureWarning, match=message): + s.to_pandas( + # limits only apply for allow_large_results=True + allow_large_results=True, + **kwargs, + ) + + +@pytest.mark.parametrize( + ("page_size", "max_results", "allow_large_results"), + [ + pytest.param(None, None, True), + pytest.param(2, None, False), + pytest.param(None, 1, True), + pytest.param(2, 5, False), + pytest.param(3, 6, True), + pytest.param(3, 100, False), + pytest.param(100, 100, True), + ], +) +def test_to_pandas_batches(scalars_dfs, page_size, max_results, allow_large_results): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + total_rows = 0 + expected_total_rows = ( + min(max_results, len(pd_series)) if max_results else len(pd_series) + ) + + hit_last_page = False + for s in bf_series.to_pandas_batches( + page_size=page_size, + max_results=max_results, + allow_large_results=allow_large_results, + ): + assert not hit_last_page + + actual_rows = s.shape[0] + expected_rows = ( + min(page_size, expected_total_rows) if page_size else expected_total_rows + ) + + assert actual_rows <= expected_rows + if actual_rows < expected_rows: + assert page_size + hit_last_page = True + + pd.testing.assert_series_equal( + s, pd_series[total_rows : total_rows + actual_rows] + ) + total_rows += actual_rows + + assert total_rows == expected_total_rows diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e286c40450..24edc91c93 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -39,6 +39,33 @@ from tests.system import utils +@pytest.fixture(scope="module") +def df_and_local_csv(scalars_df_index): + # BigQuery load job schema auto-detection has trouble with the bytes, + # datetime, numeric and geometry types, so those columns are skipped here. 
+ drop_columns = ["bytes_col", "datetime_col", "numeric_col", "geography_col"] + scalars_df_index = scalars_df_index.drop(columns=drop_columns) + + with tempfile.TemporaryDirectory() as dir: + # Prepares local CSV file for reading + path = dir + "/write_df_to_local_csv_file.csv" + scalars_df_index.to_csv(path, index=True) + yield scalars_df_index, path + + +@pytest.fixture(scope="module") +def df_and_gcs_csv(scalars_df_index, gcs_folder): + # BigQuery load job schema auto-detection has trouble with the bytes, + # datetime, numeric and geometry types, so those columns are skipped here. + drop_columns = ["bytes_col", "datetime_col", "numeric_col", "geography_col"] + scalars_df_index = scalars_df_index.drop(columns=drop_columns) + + path = gcs_folder + "test_read_csv_w_write_engine*.csv" + read_path = utils.get_first_file_from_wildcard(path) + scalars_df_index.to_csv(path, index=True) + return scalars_df_index, read_path + + def test_read_gbq_tokyo( session_tokyo: bigframes.Session, scalars_table_tokyo: str, @@ -630,8 +657,7 @@ def test_read_gbq_w_json(session): ) ), """ - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True) + df = session.read_gbq(sql, index_col="id") assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) @@ -651,17 +677,14 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session): df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - result = df.to_pandas(allow_large_results=True) - # These JSON strings are compatible with BigQuery's JSON storage, pd_df = pd.DataFrame( {"json_col": ['{"bar":true,"foo":10}']}, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) - pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + pd.testing.assert_series_equal(df.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(df["json_col"].to_pandas(), pd_df["json_col"]) def test_read_gbq_w_json_in_struct(session): @@ -697,9 +720,6 @@ def test_read_gbq_w_json_in_struct(session): data = df["struct_col"].struct.field("data") assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - data = data.to_pandas(allow_large_results=True) - assert data[0] == '{"boolean":true}' assert data[1] == '{"int":100}' assert data[2] == '{"float":0.98}' @@ -738,10 +758,7 @@ def test_read_gbq_w_json_in_array(session): assert data.list.len()[0] == 7 assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd_data = data.to_pandas(allow_large_results=True) - - assert pd_data[0] == [ + assert data[0] == [ '{"boolean":true}', '{"int":100}', '{"float":0.98}', @@ -853,17 +870,20 @@ def test_read_pandas_tokyo( ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], ) def test_read_pandas_timedelta_dataframes(session, write_engine): - expected_df = pd.DataFrame({"my_col": pd.to_timedelta([1, 2, 3], unit="d")}) - - actual_result = ( - session.read_pandas(expected_df, write_engine=write_engine) - .to_pandas() - .astype("timedelta64[ns]") + pytest.importorskip( + "pandas", + minversion="2.0.0", + reason="old versions don't 
support local casting to arrow duration", ) + pandas_df = pd.DataFrame({"my_col": pd.to_timedelta([1, 2, 3], unit="d")}) - if write_engine == "bigquery_streaming": - expected_df.index = pd.Index([pd.NA] * 3, dtype="Int64") - pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) + actual_result = session.read_pandas( + pandas_df, write_engine=write_engine + ).to_pandas() + expected_result = pandas_df.astype(bigframes.dtypes.TIMEDELTA_DTYPE) + expected_result.index = expected_result.index.astype(bigframes.dtypes.INT_DTYPE) + + pd.testing.assert_frame_equal(actual_result, expected_result) @pytest.mark.parametrize( @@ -873,15 +893,12 @@ def test_read_pandas_timedelta_dataframes(session, write_engine): def test_read_pandas_timedelta_series(session, write_engine): expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_series, write_engine=write_engine) .to_pandas() .astype("timedelta64[ns]") ) - if write_engine == "bigquery_streaming": - expected_series.index = pd.Index([pd.NA] * 3, dtype="Int64") pd.testing.assert_series_equal( actual_result, expected_series, check_index_type=False ) @@ -889,17 +906,16 @@ def test_read_pandas_timedelta_series(session, write_engine): @pytest.mark.parametrize( "write_engine", - ["default", "bigquery_inline", "bigquery_load"], + ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], ) def test_read_pandas_timedelta_index(session, write_engine): expected_index = pd.to_timedelta( [1, 2, 3], unit="d" ) # to_timedelta returns an index - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_index, write_engine=write_engine) - .to_pandas(allow_large_results=True) + .to_pandas() .astype("timedelta64[ns]") ) @@ -912,7 +928,7 @@ def test_read_pandas_timedelta_index(session, write_engine): pytest.param("default"), pytest.param("bigquery_load"), pytest.param("bigquery_streaming"), - pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param("bigquery_inline"), ], ) def test_read_pandas_json_dataframes(session, write_engine): @@ -926,19 +942,21 @@ def test_read_pandas_json_dataframes(session, write_engine): {"my_col": pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE)} ) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_df, write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() - if write_engine == "bigquery_streaming": - expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64") pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) @pytest.mark.parametrize( - "write_engine", - ["default", "bigquery_load"], + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_load"), + pytest.param("bigquery_streaming"), + pytest.param("bigquery_inline"), + ], ) def test_read_pandas_json_series(session, write_engine): json_data = [ @@ -949,10 +967,9 @@ def test_read_pandas_json_series(session, write_engine): ] expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_series, write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() pd.testing.assert_series_equal( 
actual_result, expected_series, check_index_type=False ) @@ -963,6 +980,8 @@ def test_read_pandas_json_series(session, write_engine): [ pytest.param("default"), pytest.param("bigquery_load"), + pytest.param("bigquery_streaming"), + pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)), ], ) def test_read_pandas_json_index(session, write_engine): @@ -973,21 +992,19 @@ def test_read_pandas_json_index(session, write_engine): '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] expected_index: pd.Index = pd.Index(json_data, dtype=bigframes.dtypes.JSON_DTYPE) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_index, write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() pd.testing.assert_index_equal(actual_result, expected_index) @pytest.mark.parametrize( ("write_engine"), [ - pytest.param("default"), pytest.param("bigquery_load"), ], ) -def test_read_pandas_w_nested_json(session, write_engine): +def test_read_pandas_w_nested_json_fails(session, write_engine): data = [ [{"json_field": "1"}], [{"json_field": None}], @@ -996,28 +1013,56 @@ def test_read_pandas_w_nested_json(session, write_engine): ] # PyArrow currently lacks support for creating structs or lists containing extension types. # See issue: https://github.com/apache/arrow/issues/45262 - pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())]))) + pa_array = pa.array(data, type=pa.list_(pa.struct([("json_field", pa.string())]))) pd_s = pd.Series( arrays.ArrowExtensionArray(pa_array), # type: ignore dtype=pd.ArrowDtype( - pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])) + pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) ), ) with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"): - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - session.read_pandas(pd_s, write_engine=write_engine).to_pandas( - allow_large_results=True - ) + session.read_pandas(pd_s, write_engine=write_engine) @pytest.mark.parametrize( ("write_engine"), [ pytest.param("default"), + pytest.param("bigquery_inline"), + pytest.param("bigquery_streaming"), + ], +) +def test_read_pandas_w_nested_json(session, write_engine): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + data = [ + [{"json_field": "1"}], + [{"json_field": None}], + [{"json_field": '["1","3","5"]'}], + [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}], + ] + pa_array = pa.array(data, type=pa.list_(pa.struct([("json_field", pa.string())]))) + pd_s = pd.Series( + arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) + ), + ) + bq_s = ( + session.read_pandas(pd_s, write_engine=write_engine) + .to_pandas() + .reset_index(drop=True) + ) + pd.testing.assert_series_equal(bq_s, pd_s) + + +@pytest.mark.parametrize( + ("write_engine"), + [ pytest.param("bigquery_load"), ], ) -def test_read_pandas_w_nested_json_index(session, write_engine): +def test_read_pandas_w_nested_json_index_fails(session, write_engine): data = [ [{"json_field": "1"}], [{"json_field": None}], @@ -1026,6 +1071,34 @@ def test_read_pandas_w_nested_json_index(session, write_engine): ] # PyArrow currently lacks support for creating structs or lists containing extension types. 
# See issue: https://github.com/apache/arrow/issues/45262 + pa_array = pa.array(data, type=pa.list_(pa.struct([("json_field", pa.string())]))) + pd_idx: pd.Index = pd.Index( + arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) + ), + ) + with pytest.raises(NotImplementedError, match="Nested JSON types, found in"): + session.read_pandas(pd_idx, write_engine=write_engine) + + +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_inline"), + pytest.param("bigquery_streaming"), + ], +) +def test_read_pandas_w_nested_json_index(session, write_engine): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + data = [ + [{"json_field": "1"}], + [{"json_field": None}], + [{"json_field": '["1","3","5"]'}], + [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}], + ] pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())]))) pd_idx: pd.Index = pd.Index( arrays.ArrowExtensionArray(pa_array), # type: ignore @@ -1033,16 +1106,10 @@ def test_read_pandas_w_nested_json_index(session, write_engine): pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])) ), ) - with pytest.raises( - NotImplementedError, match="Nested JSON types, found in the index" - ): - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - session.read_pandas(pd_idx, write_engine=write_engine).to_pandas( - allow_large_results=True - ) + bq_idx = session.read_pandas(pd_idx, write_engine=write_engine).to_pandas() + pd.testing.assert_index_equal(bq_idx, pd_idx) -@utils.skip_legacy_pandas @pytest.mark.parametrize( ("write_engine",), ( @@ -1052,88 +1119,23 @@ def test_read_pandas_w_nested_json_index(session, write_engine): ("bigquery_streaming",), ), ) -def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder, write_engine): - scalars_df, _ = scalars_dfs - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" - read_path = utils.get_first_file_from_wildcard(path) - scalars_df.to_csv(path, index=False) - dtype = scalars_df.dtypes.to_dict() - dtype.pop("geography_col") - df = session.read_csv( - read_path, - # Convert default pandas dtypes to match BigQuery DataFrames dtypes. - dtype=dtype, - write_engine=write_engine, - ) +def test_read_csv_for_gcs_file_w_write_engine(session, df_and_gcs_csv, write_engine): + scalars_df, path = df_and_gcs_csv - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal(df.columns, scalars_df.columns) - - # The auto detects of BigQuery load job have restrictions to detect the bytes, - # numeric and geometry types, so they're skipped here. 
- df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) - scalars_df = scalars_df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) - assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) - - -def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): - scalars_df, _ = scalars_dfs - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" - scalars_df.to_csv(path, index=False) - df = session.read_csv( + # Compares results for pandas and bigframes engines + pd_df = session.read_csv( path, - engine="bigquery", - index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, - ) - - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal(df.columns, scalars_df.columns) - - # The auto detects of BigQuery load job have restrictions to detect the bytes, - # datetime, numeric and geometry types, so they're skipped here. - df = df.drop(columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"]) - scalars_df = scalars_df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + index_col="rowindex", + write_engine=write_engine, + dtype=scalars_df.dtypes.to_dict(), ) - assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) - - -@pytest.mark.parametrize( - "sep", - [ - pytest.param(",", id="default_sep"), - pytest.param("\t", id="custom_sep"), - ], -) -@utils.skip_legacy_pandas -def test_read_csv_local_default_engine(session, scalars_dfs, sep): - scalars_df, scalars_pandas_df = scalars_dfs - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_default_engine.csv" - # Using the pandas to_csv method because the BQ one does not support local write. - scalars_pandas_df.to_csv(path, index=False, sep=sep) - dtype = scalars_df.dtypes.to_dict() - dtype.pop("geography_col") - df = session.read_csv( - path, - sep=sep, - # Convert default pandas dtypes to match BigQuery DataFrames dtypes. - dtype=dtype, - ) + pd.testing.assert_frame_equal(pd_df.to_pandas(), scalars_df.to_pandas()) - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal(df.columns, scalars_df.columns) - - # The auto detects of BigQuery load job have restrictions to detect the bytes, - # numeric and geometry types, so they're skipped here. - df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) - scalars_df = scalars_df.drop( - columns=["bytes_col", "numeric_col", "geography_col"] + if write_engine in ("default", "bigquery_load"): + bf_df = session.read_csv( + path, engine="bigquery", index_col="rowindex", write_engine=write_engine ) - assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1143,71 +1145,77 @@ def test_read_csv_local_default_engine(session, scalars_dfs, sep): pytest.param("\t", id="custom_sep"), ], ) -def test_read_csv_local_bq_engine(session, scalars_dfs, sep): - scalars_df, scalars_pandas_df = scalars_dfs - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_bq_engine.csv" - # Using the pandas to_csv method because the BQ one does not support local write. 
- scalars_pandas_df.to_csv(path, index=False, sep=sep) - df = session.read_csv(path, engine="bigquery", sep=sep) +def test_read_csv_for_local_file_w_sep(session, df_and_local_csv, sep): + scalars_df, _ = df_and_local_csv - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal(df.columns, scalars_df.columns) + with tempfile.TemporaryDirectory() as dir: + # Prepares local CSV file for reading + path = dir + "/test_read_csv_for_local_file_w_sep.csv" + scalars_df.to_csv(path, index=True, sep=sep) - # The auto detects of BigQuery load job have restrictions to detect the bytes, - # datetime, numeric and geometry types, so they're skipped here. - df = df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + # Compares results for pandas and bigframes engines + with open(path, "rb") as buffer: + bf_df = session.read_csv( + buffer, engine="bigquery", index_col="rowindex", sep=sep + ) + with open(path, "rb") as buffer: + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + buffer, index_col="rowindex", sep=sep, dtype=scalars_df.dtypes.to_dict() + ) + pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_w_index_col_false(session, df_and_local_csv): + # Compares results for pandas and bigframes engines + scalars_df, path = df_and_local_csv + with open(path, "rb") as buffer: + bf_df = session.read_csv( + buffer, + engine="bigquery", + index_col=False, ) - scalars_df = scalars_df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + with open(path, "rb") as buffer: + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + buffer, index_col=False, dtype=scalars_df.dtypes.to_dict() ) - assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + assert bf_df.shape[0] == scalars_df.shape[0] + assert bf_df.shape[0] == pd_df.shape[0] -def test_read_csv_localbuffer_bq_engine(session, scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_bq_engine.csv" - # Using the pandas to_csv method because the BQ one does not support local write. - scalars_pandas_df.to_csv(path, index=False) - with open(path, "rb") as buffer: - df = session.read_csv(buffer, engine="bigquery") + # We use a default index because of index_col=False, so the previous index + # column is just loaded as a column. + assert len(bf_df.columns) == len(scalars_df.columns) + 1 + assert len(bf_df.columns) == len(pd_df.columns) - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal(df.columns, scalars_df.columns) + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index("rowindex").sort_index() + pd_df = pd_df.set_index("rowindex") - # The auto detects of BigQuery load job have restrictions to detect the bytes, - # datetime, numeric and geometry types, so they're skipped here. 
- df = df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] - ) - scalars_df = scalars_df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] - ) - assert df.shape[0] == scalars_df.shape[0] - pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) -def test_read_csv_bq_engine_supports_index_col_false( - session, scalars_df_index, gcs_folder -): - path = gcs_folder + "test_read_csv_bq_engine_supports_index_col_false*.csv" - read_path = utils.get_first_file_from_wildcard(path) - scalars_df_index.to_csv(path) +def test_read_csv_w_index_col_column_label(session, df_and_gcs_csv): + scalars_df, path = df_and_gcs_csv + bf_df = session.read_csv(path, engine="bigquery", index_col="rowindex") - df = session.read_csv( - read_path, - # Normally, pandas uses the first column as the index. index_col=False - # turns off that behavior. - index_col=False, + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, index_col="rowindex", dtype=scalars_df.dtypes.to_dict() ) - assert df.shape[0] == scalars_df_index.shape[0] - # We use a default index because of index_col=False, so the previous index - # column is just loaded as a column. - assert len(df.columns) == len(scalars_df_index.columns) + 1 + assert bf_df.shape == scalars_df.shape + assert bf_df.shape == pd_df.shape + + assert len(bf_df.columns) == len(scalars_df.columns) + assert len(bf_df.columns) == len(pd_df.columns) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @pytest.mark.parametrize( @@ -1242,155 +1250,62 @@ def test_read_csv_default_engine_throws_not_implemented_error( session.read_csv(read_path, **kwargs) -def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_folder): - path = gcs_folder + "test_read_csv_gcs_default_engine_w_header*.csv" - read_path = utils.get_first_file_from_wildcard(path) - scalars_df_index.to_csv(path) - - # Skips header=N rows, normally considers the N+1th row as the header, but overridden by - # passing the `names` argument. In this case, pandas will skip the N+1th row too, take - # the column names from `names`, and begin reading data from the N+2th row. - df = session.read_csv( - read_path, - header=2, - names=scalars_df_index.columns.to_list(), +@pytest.mark.parametrize( + "header", + [0, 1, 5], +) +def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header): + # Compares results for pandas and bigframes engines + scalars_df, path = df_and_gcs_csv + bf_df = session.read_csv(path, engine="bigquery", index_col=False, header=header) + pd_df = session.read_csv( + path, index_col=False, header=header, dtype=scalars_df.dtypes.to_dict() ) - assert df.shape[0] == scalars_df_index.shape[0] - 2 - assert len(df.columns) == len(scalars_df_index.columns) - - -def test_read_csv_gcs_bq_engine_w_header(session, scalars_df_index, gcs_folder): - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header*.csv" - scalars_df_index.to_csv(path, index=False) - - # Skip the header and the first 2 data rows. Note that one line of header - # also got added while writing the csv through `to_csv`, so we would have to - # pass headers=3 in the `read_csv` to skip reading the header and two rows. 
- # Without provided schema, the column names would be like `bool_field_0`, - # `string_field_1` and etc. - df = session.read_csv(path, header=3, engine="bigquery") - assert df.shape[0] == scalars_df_index.shape[0] - 2 - assert len(df.columns) == len(scalars_df_index.columns) - - -def test_read_csv_local_default_engine_w_header(session, scalars_pandas_df_index): - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_default_engine_w_header.csv" - # Using the pandas to_csv method because the BQ one does not support local write. - scalars_pandas_df_index.to_csv(path, index=False) - - # Skips header=N rows. Normally row N+1 would be the header now, but overridden by - # passing the `names` argument. In this case, pandas will skip row N+1 too, infer - # the column names from `names`, and begin reading data from row N+2. - df = session.read_csv( - path, - header=2, - names=scalars_pandas_df_index.columns.to_list(), - ) - assert df.shape[0] == scalars_pandas_df_index.shape[0] - 2 - assert len(df.columns) == len(scalars_pandas_df_index.columns) - - -def test_read_csv_local_bq_engine_w_header(session, scalars_pandas_df_index): - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_bq_engine_w_header.csv" - # Using the pandas to_csv method because the BQ one does not support local write. - scalars_pandas_df_index.to_csv(path, index=False) - - # Skip the header and the first 2 data rows. Note that one line of - # header also got added while writing the csv through `to_csv`, so we - # would have to pass headers=3 in the `read_csv` to skip reading the - # header and two rows. Without provided schema, the column names would - # be like `bool_field_0`, `string_field_1` and etc. - df = session.read_csv(path, header=3, engine="bigquery") - assert df.shape[0] == scalars_pandas_df_index.shape[0] - 2 - assert len(df.columns) == len(scalars_pandas_df_index.columns) - - -def test_read_csv_gcs_default_engine_w_index_col_name( - session, scalars_df_default_index, gcs_folder -): - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name*.csv" - read_path = utils.get_first_file_from_wildcard(path) - scalars_df_default_index.to_csv(path) - df = session.read_csv(read_path, index_col="rowindex") - scalars_df_default_index = scalars_df_default_index.set_index( - "rowindex" - ).sort_index() - pd.testing.assert_index_equal(df.columns, scalars_df_default_index.columns) - assert df.index.name == "rowindex" + # b/408461403: workaround the issue where the slice does not work for DataFrame. 
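+    # The workaround below slices the local pandas copy and re-uploads it with
+    # read_pandas, so expected_df still reflects the rows remaining after the
+    # header skip.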
+    expected_df = session.read_pandas(scalars_df.to_pandas()[header:])
+    assert pd_df.shape[0] == expected_df.shape[0]
+    assert bf_df.shape[0] == pd_df.shape[0]


-def test_read_csv_gcs_default_engine_w_index_col_index(
-    session, scalars_df_default_index, gcs_folder
-):
-    path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index*.csv"
-    read_path = utils.get_first_file_from_wildcard(path)
-    scalars_df_default_index.to_csv(path)
-
-    index_col = scalars_df_default_index.columns.to_list().index("rowindex")
-    df = session.read_csv(read_path, index_col=index_col)
-    scalars_df_default_index = scalars_df_default_index.set_index(
-        "rowindex"
-    ).sort_index()
-    pd.testing.assert_index_equal(df.columns, scalars_df_default_index.columns)
-    assert df.index.name == "rowindex"
-
-
-def test_read_csv_local_default_engine_w_index_col_name(
-    session, scalars_pandas_df_default_index
-):
-    with tempfile.TemporaryDirectory() as dir:
-        path = dir + "/test_read_csv_local_default_engine_w_index_col_name"
-        # Using the pandas to_csv method because the BQ one does not support local write.
-        scalars_pandas_df_default_index.to_csv(path, index=False)
-
-        df = session.read_csv(path, index_col="rowindex")
-        scalars_pandas_df_default_index = scalars_pandas_df_default_index.set_index(
-            "rowindex"
-        ).sort_index()
-        pd.testing.assert_index_equal(
-            df.columns, scalars_pandas_df_default_index.columns
+    # We use a default index because of index_col=False, so the previous index
+    # column is just loaded as a column.
+    assert len(pd_df.columns) == len(expected_df.columns) + 1
+    assert len(bf_df.columns) == len(pd_df.columns)
+
+    # When `header > 0`, pandas and BigFrames may handle column naming differently.
+    # Pandas uses the literal content of the specified header row for column names,
+    # regardless of what it is. BigQuery, however, might generate default names based
+    # on data type (e.g., bool_field_0, string_field_1, etc.).
+    if header == 0:
+        # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
+        # (b/280889935) or guarantee row ordering.
+        bf_df = bf_df.set_index("rowindex").sort_index()
+        pd_df = pd_df.set_index("rowindex")
+        pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas())
+        pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
+
+
+def test_read_csv_w_usecols(session, df_and_local_csv):
+    # Compares results for pandas and bigframes engines
+    scalars_df, path = df_and_local_csv
+    with open(path, "rb") as buffer:
+        bf_df = session.read_csv(
+            buffer,
+            engine="bigquery",
+            usecols=["bool_col"],
         )
-        assert df.index.name == "rowindex"
-
-
-def test_read_csv_local_default_engine_w_index_col_index(
-    session, scalars_pandas_df_default_index
-):
-    with tempfile.TemporaryDirectory() as dir:
-        path = dir + "/test_read_csv_local_default_engine_w_index_col_index"
-        # Using the pandas to_csv method because the BQ one does not support local write.
-        scalars_pandas_df_default_index.to_csv(path, index=False)
-
-        index_col = scalars_pandas_df_default_index.columns.to_list().index("rowindex")
-        df = session.read_csv(path, index_col=index_col)
-        scalars_pandas_df_default_index = scalars_pandas_df_default_index.set_index(
-            "rowindex"
-        ).sort_index()
-        pd.testing.assert_index_equal(
-            df.columns, scalars_pandas_df_default_index.columns
+    with open(path, "rb") as buffer:
+        # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
+ pd_df = session.read_csv( + buffer, + usecols=["bool_col"], + dtype=scalars_df[["bool_col"]].dtypes.to_dict(), ) - assert df.index.name == "rowindex" - - -@pytest.mark.parametrize( - "engine", - [ - pytest.param("bigquery", id="bq_engine"), - pytest.param(None, id="default_engine"), - ], -) -def test_read_csv_gcs_w_usecols(session, scalars_df_index, gcs_folder, engine): - path = gcs_folder + "test_read_csv_gcs_w_usecols" - path = path + "_default_engine*.csv" if engine is None else path + "_bq_engine*.csv" - read_path = utils.get_first_file_from_wildcard(path) if engine is None else path - scalars_df_index.to_csv(path) - # df should only have 1 column which is bool_col. - df = session.read_csv(read_path, usecols=["bool_col"], engine=engine) - assert len(df.columns) == 1 + # Cannot compare two dataframe due to b/408499371. + assert len(bf_df.columns) == 1 + assert len(pd_df.columns) == 1 @pytest.mark.parametrize( @@ -1424,36 +1339,37 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine): pytest.param(None, id="default_engine"), ], ) -def test_read_csv_others(session, engine): +def test_read_csv_for_others_files(session, engine): uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv" df = session.read_csv(uri, engine=engine) assert len(df.columns) == 3 -@pytest.mark.parametrize( - "engine", - [ - pytest.param("bigquery", id="bq_engine"), - pytest.param(None, id="default_engine"), - ], -) -def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index, engine): +def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index): with tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_local_w_encoding.csv" # Using the pandas to_csv method because the BQ one does not support local write. - penguins_pandas_df_default_index.to_csv( - path, index=False, encoding="ISO-8859-1" - ) + penguins_pandas_df_default_index.index.name = "rowindex" + penguins_pandas_df_default_index.to_csv(path, index=True, encoding="ISO-8859-1") # File can only be read using the same character encoding as when written. - df = session.read_csv(path, engine=engine, encoding="ISO-8859-1") - - # TODO(chelsealin): If we serialize the index, can more easily compare values. - pd.testing.assert_index_equal( - df.columns, penguins_pandas_df_default_index.columns + pd_df = session.read_csv( + path, + index_col="rowindex", + encoding="ISO-8859-1", + dtype=penguins_pandas_df_default_index.dtypes.to_dict(), ) - assert df.shape[0] == penguins_pandas_df_default_index.shape[0] + bf_df = session.read_csv( + path, engine="bigquery", index_col="rowindex", encoding="ISO-8859-1" + ) + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. 
+ bf_df = bf_df.sort_index() + pd.testing.assert_frame_equal( + bf_df.to_pandas(), penguins_pandas_df_default_index + ) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_pickle_local(session, penguins_pandas_df_default_index, tmp_path): @@ -1706,3 +1622,66 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): assert df.shape[0] == scalars_df.shape[0] pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + + +def test_read_gbq_test(test_session: bigframes.Session): + test_project_id = "bigframes-dev" + test_dataset_id = "test_env_only" + test_table_id = "one_table" + table_id = f"{test_project_id}.{test_dataset_id}.{test_table_id}" + actual = test_session.read_gbq(table_id).to_pandas() + + assert actual.shape == (1, 1) + + +@pytest.mark.parametrize( + ("query_or_table", "index_col", "columns"), + [ + pytest.param( + "{scalars_table_id}", + ("int64_col", "string_col", "int64_col"), + ("float64_col", "bool_col"), + id="table_input_index_col_dup", + marks=pytest.mark.xfail( + raises=ValueError, + reason="ValueError: Duplicate names within 'index_col'.", + strict=True, + ), + ), + pytest.param( + """SELECT int64_col, string_col, float64_col, bool_col + FROM `{scalars_table_id}`""", + ("int64_col",), + ("string_col", "float64_col", "string_col"), + id="query_input_columns_dup", + marks=pytest.mark.xfail( + raises=ValueError, + reason="ValueError: Duplicate names within 'columns'.", + strict=True, + ), + ), + pytest.param( + "{scalars_table_id}", + ("int64_col", "string_col"), + ("float64_col", "string_col", "bool_col"), + id="table_input_cross_dup", + marks=pytest.mark.xfail( + raises=ValueError, + reason="ValueError: Overlap between 'index_col' and 'columns'.", + strict=True, + ), + ), + ], +) +def test_read_gbq_duplicate_columns_xfail( + session: bigframes.Session, + scalars_table_id: str, + query_or_table: str, + index_col: tuple, + columns: tuple, +): + session.read_gbq( + query_or_table.format(scalars_table_id=scalars_table_id), + index_col=index_col, + columns=columns, + ) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index f270d1903c..f6a56af7ff 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,11 +19,7 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import ( - assert_pandas_df_equal, - assert_series_equal, - skip_legacy_pandas, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_unordered_mode_sql_no_hash(unordered_session): @@ -77,8 +73,9 @@ def test_unordered_mode_print(unordered_session): print(df) -@skip_legacy_pandas def test_unordered_mode_read_gbq(unordered_session): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") df = unordered_session.read_gbq( """SELECT [1, 3, 2] AS array_column, @@ -221,7 +218,6 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): df.groupby("a").head(3) -@skip_legacy_pandas @pytest.mark.parametrize( ("rule", "origin", "data"), [ @@ -255,6 +251,8 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): ], ) def test__resample_with_index(unordered_session, rule, origin, data): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" scalars_df_index = bpd.DataFrame(data, session=unordered_session).set_index(col) scalars_pandas_df_index = 
pd.DataFrame(data).set_index(col) diff --git a/tests/system/small/test_window.py b/tests/system/small/test_window.py index 68613f1372..b48bb8bc86 100644 --- a/tests/system/small/test_window.py +++ b/tests/system/small/test_window.py @@ -12,24 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime + +import numpy as np import pandas as pd import pytest +from bigframes import dtypes + @pytest.fixture(scope="module") -def rolling_dfs(scalars_dfs): +def rows_rolling_dfs(scalars_dfs): bf_df, pd_df = scalars_dfs - target_cols = ["int64_too", "float64_col", "bool_col"] + target_cols = ["int64_too", "float64_col", "int64_col"] + + return bf_df[target_cols], pd_df[target_cols] + + +@pytest.fixture(scope="module") +def range_rolling_dfs(session): + values = np.arange(20) + pd_df = pd.DataFrame( + { + "ts_col": pd.Timestamp("20250101", tz="UTC") + pd.to_timedelta(values, "s"), + "int_col": values % 4, + "float_col": values / 2, + } + ) - bf_df = bf_df[target_cols].set_index("bool_col") - pd_df = pd_df[target_cols].set_index("bool_col") + bf_df = session.read_pandas(pd_df) return bf_df, pd_df @pytest.fixture(scope="module") -def rolling_series(scalars_dfs): +def rows_rolling_series(scalars_dfs): bf_df, pd_df = scalars_dfs target_col = "int64_too" @@ -37,8 +55,8 @@ def rolling_series(scalars_dfs): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_dataframe_rolling_closed_param(rolling_dfs, closed): - bf_df, pd_df = rolling_dfs +def test_dataframe_rolling_closed_param(rows_rolling_dfs, closed): + bf_df, pd_df = rows_rolling_dfs actual_result = bf_df.rolling(window=3, closed=closed).sum().to_pandas() @@ -47,38 +65,72 @@ def test_dataframe_rolling_closed_param(rolling_dfs, closed): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_dataframe_groupby_rolling_closed_param(rolling_dfs, closed): - bf_df, pd_df = rolling_dfs +def test_dataframe_groupby_rolling_closed_param(rows_rolling_dfs, closed): + bf_df, pd_df = rows_rolling_dfs + # Need to specify column subset for comparison due to b/406841327 + check_columns = ["float64_col", "int64_col"] actual_result = ( - bf_df.groupby(level=0).rolling(window=3, closed=closed).sum().to_pandas() + bf_df.groupby(bf_df["int64_too"] % 2) + .rolling(window=3, closed=closed) + .sum() + .to_pandas() ) - expected_result = pd_df.groupby(level=0).rolling(window=3, closed=closed).sum() - pd.testing.assert_frame_equal(actual_result, expected_result, check_dtype=False) + expected_result = ( + pd_df.groupby(pd_df["int64_too"] % 2).rolling(window=3, closed=closed).sum() + ) + pd.testing.assert_frame_equal( + actual_result[check_columns], expected_result, check_dtype=False + ) -def test_dataframe_rolling_default_closed_param(rolling_dfs): - bf_df, pd_df = rolling_dfs +def test_dataframe_rolling_on(rows_rolling_dfs): + bf_df, pd_df = rows_rolling_dfs - actual_result = bf_df.rolling(window=3).sum().to_pandas() + actual_result = bf_df.rolling(window=3, on="int64_too").sum().to_pandas() - expected_result = pd_df.rolling(window=3).sum() + expected_result = pd_df.rolling(window=3, on="int64_too").sum() pd.testing.assert_frame_equal(actual_result, expected_result, check_dtype=False) -def test_dataframe_groupby_rolling_default_closed_param(rolling_dfs): - bf_df, pd_df = rolling_dfs +def test_dataframe_rolling_on_invalid_column_raise_error(rows_rolling_dfs): + bf_df, _ = rows_rolling_dfs - actual_result = 
bf_df.groupby(level=0).rolling(window=3).sum().to_pandas() + with pytest.raises(ValueError): + bf_df.rolling(window=3, on="whatever").sum() - expected_result = pd_df.groupby(level=0).rolling(window=3).sum() - pd.testing.assert_frame_equal(actual_result, expected_result, check_dtype=False) + +def test_dataframe_groupby_rolling_on(rows_rolling_dfs): + bf_df, pd_df = rows_rolling_dfs + # Need to specify column subset for comparison due to b/406841327 + check_columns = ["float64_col", "int64_col"] + + actual_result = ( + bf_df.groupby(bf_df["int64_too"] % 2) + .rolling(window=3, on="float64_col") + .sum() + .to_pandas() + ) + + expected_result = ( + pd_df.groupby(pd_df["int64_too"] % 2).rolling(window=3, on="float64_col").sum() + ) + pd.testing.assert_frame_equal( + actual_result[check_columns], expected_result, check_dtype=False + ) + + +def test_dataframe_groupby_rolling_on_invalid_column_raise_error(rows_rolling_dfs): + bf_df, _ = rows_rolling_dfs + + with pytest.raises(ValueError): + bf_df.groupby(level=0).rolling(window=3, on="whatever").sum() @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_series_rolling_closed_param(rolling_series, closed): - bf_series, df_series = rolling_series +def test_series_rolling_closed_param(rows_rolling_series, closed): + bf_series, df_series = rows_rolling_series actual_result = bf_series.rolling(window=3, closed=closed).sum().to_pandas() @@ -87,8 +139,8 @@ def test_series_rolling_closed_param(rolling_series, closed): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_series_groupby_rolling_closed_param(rolling_series, closed): - bf_series, df_series = rolling_series +def test_series_groupby_rolling_closed_param(rows_rolling_series, closed): + bf_series, df_series = rows_rolling_series actual_result = ( bf_series.groupby(bf_series % 2) @@ -103,24 +155,6 @@ def test_series_groupby_rolling_closed_param(rolling_series, closed): pd.testing.assert_series_equal(actual_result, expected_result, check_dtype=False) -def test_series_rolling_default_closed_param(rolling_series): - bf_series, df_series = rolling_series - - actual_result = bf_series.rolling(window=3).sum().to_pandas() - - expected_result = df_series.rolling(window=3).sum() - pd.testing.assert_series_equal(actual_result, expected_result, check_dtype=False) - - -def test_series_groupby_rolling_default_closed_param(rolling_series): - bf_series, df_series = rolling_series - - actual_result = bf_series.groupby(bf_series % 2).rolling(window=3).sum().to_pandas() - - expected_result = df_series.groupby(df_series % 2).rolling(window=3).sum() - pd.testing.assert_series_equal(actual_result, expected_result, check_dtype=False) - - @pytest.mark.parametrize( ("windowing"), [ @@ -146,8 +180,8 @@ def test_series_groupby_rolling_default_closed_param(rolling_series): pytest.param(lambda x: x.var(), id="var"), ], ) -def test_series_window_agg_ops(rolling_series, windowing, agg_op): - bf_series, pd_series = rolling_series +def test_series_window_agg_ops(rows_rolling_series, windowing, agg_op): + bf_series, pd_series = rows_rolling_series actual_result = agg_op(windowing(bf_series)).to_pandas() @@ -181,10 +215,183 @@ def test_series_window_agg_ops(rolling_series, windowing, agg_op): pytest.param(lambda x: x.var(), id="var"), ], ) -def test_dataframe_window_agg_ops(rolling_dfs, windowing, agg_op): - bf_df, pd_df = rolling_dfs +def test_dataframe_window_agg_ops(scalars_dfs, windowing, agg_op): + bf_df, pd_df = scalars_dfs + target_columns = ["int64_too", 
"float64_col", "bool_col"] + index_column = "bool_col" + bf_df = bf_df[target_columns].set_index(index_column) + pd_df = pd_df[target_columns].set_index(index_column) bf_result = agg_op(windowing(bf_df)).to_pandas() pd_result = agg_op(windowing(pd_df)) pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize( + "window", # skipped numpy timedelta because Pandas does not support it. + [pd.Timedelta("3s"), datetime.timedelta(seconds=3), "3s"], +) +@pytest.mark.parametrize("ascending", [True, False]) +def test_series_range_rolling(range_rolling_dfs, window, closed, ascending): + bf_df, pd_df = range_rolling_dfs + bf_series = bf_df.set_index("ts_col")["int_col"] + pd_series = pd_df.set_index("ts_col")["int_col"] + + actual_result = ( + bf_series.sort_index(ascending=ascending) + .rolling(window=window, closed=closed) + .min() + .to_pandas() + ) + + expected_result = ( + pd_series.sort_index(ascending=ascending) + .rolling(window=window, closed=closed) + .min() + ) + pd.testing.assert_series_equal( + actual_result, expected_result, check_dtype=False, check_index=False + ) + + +def test_series_groupby_range_rolling(range_rolling_dfs): + bf_df, pd_df = range_rolling_dfs + bf_series = bf_df.set_index("ts_col")["int_col"] + pd_series = pd_df.set_index("ts_col")["int_col"] + + actual_result = ( + bf_series.sort_index() + .groupby(bf_series % 2 == 0) + .rolling(window="3s") + .min() + .to_pandas() + ) + + expected_result = ( + pd_series.sort_index().groupby(pd_series % 2 == 0).rolling(window="3s").min() + ) + pd.testing.assert_series_equal( + actual_result, expected_result, check_dtype=False, check_index=False + ) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize( + "window", # skipped numpy timedelta because Pandas does not support it. + [pd.Timedelta("3s"), datetime.timedelta(seconds=3), "3s"], +) +@pytest.mark.parametrize("ascending", [True, False]) +def test_dataframe_range_rolling(range_rolling_dfs, window, closed, ascending): + bf_df, pd_df = range_rolling_dfs + bf_df = bf_df.set_index("ts_col") + pd_df = pd_df.set_index("ts_col") + + actual_result = ( + bf_df.sort_index(ascending=ascending) + .rolling(window=window, closed=closed) + .min() + .to_pandas() + ) + + expected_result = ( + pd_df.sort_index(ascending=ascending) + .rolling(window=window, closed=closed) + .min() + ) + # Need to cast Pandas index type. 
Otherwise it uses DatetimeIndex that + # does not exist in BigFrame + expected_result.index = expected_result.index.astype(dtypes.TIMESTAMP_DTYPE) + pd.testing.assert_frame_equal( + actual_result, + expected_result, + check_dtype=False, + ) + + +def test_dataframe_range_rolling_on(range_rolling_dfs): + bf_df, pd_df = range_rolling_dfs + on = "ts_col" + + actual_result = bf_df.sort_values(on).rolling(window="3s", on=on).min().to_pandas() + + expected_result = pd_df.sort_values(on).rolling(window="3s", on=on).min() + # Need to specify the column order because Pandas (seemingly) + # re-arranges columns alphabetically + cols = ["ts_col", "int_col", "float_col"] + pd.testing.assert_frame_equal( + actual_result[cols], + expected_result[cols], + check_dtype=False, + check_index_type=False, + ) + + +def test_dataframe_groupby_range_rolling(range_rolling_dfs): + bf_df, pd_df = range_rolling_dfs + on = "ts_col" + + actual_result = ( + bf_df.sort_values(on) + .groupby("int_col") + .rolling(window="3s", on=on) + .min() + .to_pandas() + ) + + expected_result = ( + pd_df.sort_values(on).groupby("int_col").rolling(window="3s", on=on).min() + ) + expected_result.index = expected_result.index.set_names("index", level=1) + pd.testing.assert_frame_equal( + actual_result, + expected_result, + check_dtype=False, + check_index_type=False, + ) + + +def test_range_rolling_order_info_lookup(range_rolling_dfs): + bf_df, pd_df = range_rolling_dfs + + actual_result = ( + bf_df.set_index("ts_col") + .sort_index(ascending=False)["int_col"] + .isin(bf_df["int_col"]) + .rolling(window="3s") + .count() + .to_pandas() + ) + + expected_result = ( + pd_df.set_index("ts_col") + .sort_index(ascending=False)["int_col"] + .isin(pd_df["int_col"]) + .rolling(window="3s") + .count() + ) + pd.testing.assert_series_equal( + actual_result, expected_result, check_dtype=False, check_index=False + ) + + +def test_range_rolling_unsupported_index_type_raise_error(range_rolling_dfs): + bf_df, _ = range_rolling_dfs + + with pytest.raises(ValueError): + bf_df["int_col"].sort_index().rolling(window="3s") + + +def test_range_rolling_unsorted_index_raise_error(range_rolling_dfs): + bf_df, _ = range_rolling_dfs + + with pytest.raises(ValueError): + bf_df.set_index("ts_col")["int_col"].rolling(window="3s") + + +def test_range_rolling_unsorted_column_raise_error(range_rolling_dfs): + bf_df, _ = range_rolling_dfs + + with pytest.raises(ValueError): + bf_df.rolling(window="3s", on="ts_col") diff --git a/tests/system/utils.py b/tests/system/utils.py index 891d813935..ecf9ae00f8 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -14,7 +14,6 @@ import base64 import decimal -import functools from typing import Iterable, Optional, Set, Union import geopandas as gpd # type: ignore @@ -66,16 +65,6 @@ ] -def skip_legacy_pandas(test): - @functools.wraps(test) - def wrapper(*args, **kwds): - if pd.__version__.startswith("1."): - pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") - return test(*args, **kwds) - - return wrapper - - # Prefer this function for tests that run in both ordered and unordered mode def assert_dfs_equivalent( pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 98a74d4e4c..b8f3a612d4 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -183,3 +183,10 @@ def test_client_endpoints_override_set_shows_warning(): with 
pytest.warns(UserWarning): options.client_endpoints_override = {"bqclient": "endpoint_address"} + + +def test_default_options(): + options = bigquery_options.BigQueryOptions() + + assert options.allow_large_results is False + assert options.ordering_mode == "strict" diff --git a/tests/unit/_tools/__init__.py b/tests/unit/_tools/__init__.py new file mode 100644 index 0000000000..378d15c4be --- /dev/null +++ b/tests/unit/_tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for helper methods for processing Python objects with minimal dependencies. + +Please keep the dependencies used in this subpackage to a minimum to avoid the +risk of circular dependencies. +""" diff --git a/tests/unit/_tools/test_strings.py b/tests/unit/_tools/test_strings.py new file mode 100644 index 0000000000..9c83df2556 --- /dev/null +++ b/tests/unit/_tools/test_strings.py @@ -0,0 +1,149 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for helper methods for processing strings with minimal dependencies. + +Please keep the dependencies used in this subpackage to a minimum to avoid the +risk of circular dependencies. +""" + +import base64 +import random +import sys +import uuid + +import pytest + +from bigframes._tools import strings + +# To stress test some unicode comparisons. 
+# https://stackoverflow.com/a/39682429/101923 +ALL_UNICODE_CHARS = "".join(chr(i) for i in range(32, 0x110000) if chr(i).isprintable()) +RANDOM_STRINGS = ( + pytest.param(str(uuid.uuid4()), id="uuid4"), + pytest.param(hex(random.randint(0, sys.maxsize)), id="hex"), + pytest.param( + base64.b64encode( + "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(100)).encode( + "utf-8" + ) + ).decode("utf-8"), + id="base64", + ), + pytest.param( + "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(8)), id="unicode8" + ), + pytest.param( + "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(64)), id="unicode64" + ), +) + + +def random_char_not_equal(avoid: str): + random_char = avoid + while random_char == avoid: + random_char = random.choice(ALL_UNICODE_CHARS) + return random_char + + +def random_deletion(original: str): + """original string with one character removed""" + char_index = random.randrange(len(original)) + return original[:char_index] + original[char_index + 1 :] + + +def random_insertion(original: str): + char_index = random.randrange(len(original)) + random_char = random.choice(ALL_UNICODE_CHARS) + return original[: char_index + 1] + random_char + original[char_index + 1 :] + + +@pytest.mark.parametrize( + ("left", "right", "expected"), + ( + ("", "", 0), + ("abc", "abc", 0), + # Deletions + ("abcxyz", "abc", 3), + ("xyzabc", "abc", 3), + ("AXYZBC", "ABC", 3), + ("AXYZBC", "XYZ", 3), + # Insertions + ("abc", "abcxyz", 3), + ("abc", "xyzabc", 3), + # Substitutions + ("abc", "aBc", 1), + ("abcxyz", "aBcXyZ", 3), + # Combinations + ("abcdefxyz", "abcExyzα", 4), + ), +) +def test_levenshtein_distance(left: str, right: str, expected: int): + assert strings.levenshtein_distance(left, right) == expected + + +@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS) +def test_levenshtein_distance_equal_strings(random_string: str): + """Mini fuzz test with different strings.""" + assert strings.levenshtein_distance(random_string, random_string) == 0 + + +@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS) +def test_levenshtein_distance_random_deletion(random_string: str): + """Mini fuzz test with different strings.""" + + num_deleted = random.randrange(1, min(10, len(random_string))) + assert 1 <= num_deleted < len(random_string) + + deleted = random_string + for _ in range(num_deleted): + deleted = random_deletion(deleted) + + assert deleted != random_string + assert len(deleted) == len(random_string) - num_deleted + assert strings.levenshtein_distance(random_string, deleted) == num_deleted + + +@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS) +def test_levenshtein_distance_random_insertion(random_string: str): + """Mini fuzz test with different strings.""" + + num_inserted = random.randrange(1, min(10, len(random_string))) + assert 1 <= num_inserted < len(random_string) + + inserted = random_string + for _ in range(num_inserted): + inserted = random_insertion(inserted) + + assert inserted != random_string + assert len(inserted) == len(random_string) + num_inserted + assert strings.levenshtein_distance(random_string, inserted) == num_inserted + + +@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS) +def test_levenshtein_distance_random_substitution(random_string: str): + """Mini fuzz test with different strings. + + Note: we don't do multiple substitutions here to avoid accidentally + substituting the same character twice. 
+ """ + char_index = random.randrange(len(random_string)) + replaced_char = random_string[char_index] + random_char = random_char_not_equal(replaced_char) + substituted = ( + random_string[:char_index] + random_char + random_string[char_index + 1 :] + ) + assert substituted != random_string + assert len(substituted) == len(random_string) + assert strings.levenshtein_distance(random_string, substituted) == 1 diff --git a/tests/unit/core/compile/sqlglot/__init__.py b/tests/unit/core/compile/sqlglot/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compile/sqlglot/test_sqlglot_types.py b/tests/unit/core/compile/sqlglot/test_sqlglot_types.py new file mode 100644 index 0000000000..a9108e5daf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_sqlglot_types.py @@ -0,0 +1,64 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pandas as pd
+import pyarrow as pa
+
+import bigframes.core.compile.sqlglot.sqlglot_types as sgt
+import bigframes.dtypes as dtypes
+
+
+def test_from_bigframes_simple_dtypes():
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.INT_DTYPE) == "INT64"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.FLOAT_DTYPE) == "FLOAT64"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.STRING_DTYPE) == "STRING"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BOOL_DTYPE) == "BOOLEAN"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.DATE_DTYPE) == "DATE"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.TIME_DTYPE) == "TIME"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.DATETIME_DTYPE) == "DATETIME"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.TIMESTAMP_DTYPE) == "TIMESTAMP"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BYTES_DTYPE) == "BYTES"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.NUMERIC_DTYPE) == "NUMERIC"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BIGNUMERIC_DTYPE) == "BIGNUMERIC"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.JSON_DTYPE) == "JSON"
+    assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.GEO_DTYPE) == "GEOGRAPHY"
+
+
+def test_from_bigframes_struct_dtypes():
+    fields = [pa.field("int_col", pa.int64()), pa.field("bool_col", pa.bool_())]
+    struct_type = pd.ArrowDtype(pa.struct(fields))
+    expected = "STRUCT<int_col INT64, bool_col BOOLEAN>"
+    assert sgt.SQLGlotType.from_bigframes_dtype(struct_type) == expected
+
+
+def test_from_bigframes_array_dtypes():
+    int_array_type = pd.ArrowDtype(pa.list_(pa.int64()))
+    assert sgt.SQLGlotType.from_bigframes_dtype(int_array_type) == "ARRAY<INT64>"
+
+    string_array_type = pd.ArrowDtype(pa.list_(pa.string()))
+    assert sgt.SQLGlotType.from_bigframes_dtype(string_array_type) == "ARRAY<STRING>"
+
+
+def test_from_bigframes_multi_nested_dtypes():
+    fields = [
+        pa.field("string_col", pa.string()),
+        pa.field("date_col", pa.date32()),
+        pa.field("array_col", pa.list_(pa.timestamp("us"))),
+    ]
+    array_type = pd.ArrowDtype(pa.list_(pa.struct(fields)))
+
+    expected = (
+        "ARRAY<STRUCT<string_col STRING, date_col DATE, array_col ARRAY<DATETIME>>>"
+    )
+    assert sgt.SQLGlotType.from_bigframes_dtype(array_type) == expected
diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py
index fb5a927e76..b1b276bda3 100644
--- a/tests/unit/core/test_blocks.py
+++ b/tests/unit/core/test_blocks.py
@@ -20,7 +20,7 @@
 import bigframes
 import bigframes.core.blocks as blocks
-import bigframes.session.executor
+import bigframes.session.bq_caching_executor
 
 
 @pytest.mark.parametrize(
@@ -80,7 +80,7 @@ def test_block_from_local(data):
     expected = pandas.DataFrame(data)
     mock_session = mock.create_autospec(spec=bigframes.Session)
    mock_executor = mock.create_autospec(
-        spec=bigframes.session.executor.BigQueryCachingExecutor
+        spec=bigframes.session.bq_caching_executor.BigQueryCachingExecutor
    )
    # hard-coded the returned dimension of the session for that each of the test case contains 3 rows.
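The new SQLGlot type tests above pin down how BigQuery DataFrames dtypes serialize to BigQuery SQL type strings, with the nested cases exercising recursion through Arrow list and struct types. A minimal sketch of that recursion (illustrative only; the to_sql_type helper and its scalar table are ad hoc stand-ins, not the library's implementation in bigframes.core.compile.sqlglot.sqlglot_types):

import pyarrow as pa

# Ad hoc subset of the scalar mapping, keyed by Arrow type string.
_SCALARS = {
    "int64": "INT64",
    "string": "STRING",
    "date32[day]": "DATE",
    "timestamp[us]": "DATETIME",  # no timezone, so DATETIME rather than TIMESTAMP
}

def to_sql_type(arrow_type: pa.DataType) -> str:
    if pa.types.is_list(arrow_type):
        # ARRAY<...> recurses into the element type.
        return f"ARRAY<{to_sql_type(arrow_type.value_type)}>"
    if pa.types.is_struct(arrow_type):
        # STRUCT<name type, ...> recurses into each field.
        fields = ", ".join(f"{f.name} {to_sql_type(f.type)}" for f in arrow_type)
        return f"STRUCT<{fields}>"
    return _SCALARS[str(arrow_type)]

# Mirrors the shape of the multi-nested expectation asserted above.
assert to_sql_type(pa.list_(pa.struct([("id", pa.int64())]))) == "ARRAY<STRUCT<id INT64>>"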
diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 6bc9c91f3a..811c64a27b 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -40,8 +40,15 @@ def method1(self): pass def method2(self): + self.method3() + + def method3(self): pass + @property + def my_field(self): + return 0 + return TestClass() @@ -51,9 +58,49 @@ def test_method_logging(test_instance): # Check if the methods were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() - assert api_methods is not None assert "testclass-method1" in api_methods assert "testclass-method2" in api_methods + assert "testclass-method3" not in api_methods + + +def test_property_logging(test_instance): + test_instance.my_field + + # Check if the properties were added to the _api_methods list + api_methods = log_adapter.get_and_reset_api_methods() + assert "testclass-my_field" in api_methods + + +def test_method_logging__include_internal_calls(): + @log_adapter.class_logger(include_internal_calls=True) + class TestClass: + def public_method(self): + self._internal_method() + + def _internal_method(self): + pass + + TestClass().public_method() + + api_methods = log_adapter.get_and_reset_api_methods() + assert "testclass-public_method" in api_methods + assert "testclass-_internal_method" in api_methods + + +def test_method_logging__exclude_internal_calls(): + @log_adapter.class_logger(include_internal_calls=False) + class TestClass: + def public_method(self): + self._internal_method() + + def _internal_method(self): + pass + + TestClass().public_method() + + api_methods = log_adapter.get_and_reset_api_methods() + assert "testclass-public_method" in api_methods + assert "testclass-_internal_method" not in api_methods def test_add_api_method_limit(test_instance): diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py index ca286cafff..913a5b61fe 100644 --- a/tests/unit/core/test_sql.py +++ b/tests/unit/core/test_sql.py @@ -14,15 +14,16 @@ import datetime import decimal +import re import pytest -import shapely # type: ignore +import shapely.geometry # type: ignore from bigframes.core import sql @pytest.mark.parametrize( - ("value", "expected"), + ("value", "expected_pattern"), ( # Try to have some literals for each scalar data type: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types @@ -32,44 +33,44 @@ (False, "False"), ( b"\x01\x02\x03ABC", - r"b'\x01\x02\x03ABC'", + re.escape(r"b'\x01\x02\x03ABC'"), ), ( datetime.date(2025, 1, 1), - "DATE('2025-01-01')", + re.escape("DATE('2025-01-01')"), ), ( datetime.datetime(2025, 1, 2, 3, 45, 6, 789123), - "DATETIME('2025-01-02T03:45:06.789123')", + re.escape("DATETIME('2025-01-02T03:45:06.789123')"), ), ( - shapely.Point(0, 1), - "ST_GEOGFROMTEXT('POINT (0 1)')", + shapely.geometry.Point(0, 1), + r"ST_GEOGFROMTEXT\('POINT \(0[.]?0* 1[.]?0*\)'\)", ), # TODO: INTERVAL type (e.g. from dateutil.relativedelta) # TODO: JSON type (TBD what Python object that would correspond to) - (123, "123"), - (decimal.Decimal("123.75"), "CAST('123.75' AS NUMERIC)"), + (123, re.escape("123")), + (decimal.Decimal("123.75"), re.escape("CAST('123.75' AS NUMERIC)")), # TODO: support BIGNUMERIC by looking at precision/scale of the DECIMAL - (123.75, "123.75"), + (123.75, re.escape("123.75")), # TODO: support RANGE type - ("abc", "'abc'"), + ("abc", re.escape("'abc'")), # TODO: support STRUCT type (possibly another method?) 
( datetime.time(12, 34, 56, 789123), - "TIME(DATETIME('1970-01-01 12:34:56.789123'))", + re.escape("TIME(DATETIME('1970-01-01 12:34:56.789123'))"), ), ( datetime.datetime( 2025, 1, 2, 3, 45, 6, 789123, tzinfo=datetime.timezone.utc ), - "TIMESTAMP('2025-01-02T03:45:06.789123+00:00')", + re.escape("TIMESTAMP('2025-01-02T03:45:06.789123+00:00')"), ), ), ) -def test_simple_literal(value, expected): +def test_simple_literal(value, expected_pattern): got = sql.simple_literal(value) - assert got == expected + assert re.match(expected_pattern, got) is not None def test_create_vector_search_sql_simple(): diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index d377fb4d49..259a4390bc 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -19,11 +19,10 @@ import pandas import pytest -import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.function as bff import bigframes.series -from tests.unit import resources +from bigframes.testing import mocks @pytest.mark.parametrize( @@ -41,8 +40,10 @@ ) def test_series_input_types_to_str(series_type): """Check that is_row_processor=True uses str as the input type to serialize a row.""" - session = resources.create_bigquery_session() - remote_function_decorator = bff.remote_function(session=session) + session = mocks.create_bigquery_session() + remote_function_decorator = bff.remote_function( + session=session, cloud_function_service_account="default" + ) with pytest.warns( bigframes.exceptions.PreviewWarning, @@ -78,8 +79,10 @@ def test_supported_types_correspond(): def test_missing_input_types(): - session = resources.create_bigquery_session() - remote_function_decorator = bff.remote_function(session=session) + session = mocks.create_bigquery_session() + remote_function_decorator = bff.remote_function( + session=session, cloud_function_service_account="default" + ) def function_without_parameter_annotations(myparam) -> str: return str(myparam) @@ -94,8 +97,10 @@ def function_without_parameter_annotations(myparam) -> str: def test_missing_output_type(): - session = resources.create_bigquery_session() - remote_function_decorator = bff.remote_function(session=session) + session = mocks.create_bigquery_session() + remote_function_decorator = bff.remote_function( + session=session, cloud_function_service_account="default" + ) def function_without_return_annotation(myparam: int): return str(myparam) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 450ce8d6ee..86cbb111f4 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -281,7 +281,7 @@ def test_customtransformer_compile_sql(mock_X): ] -def create_bq_model_mock(mocker, transform_columns, feature_columns=None): +def create_bq_model_mock(monkeypatch, transform_columns, feature_columns=None): properties = {"transformColumns": transform_columns} mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) @@ -289,18 +289,19 @@ def create_bq_model_mock(mocker, transform_columns, feature_columns=None): result = [ bigquery.standard_sql.StandardSqlField(col, None) for col in feature_columns ] - mocker.patch( - "google.cloud.bigquery.model.Model.feature_columns", - new_callable=mock.PropertyMock(return_value=result), + monkeypatch.setattr( + type(mock_bq_model), + "feature_columns", + mock.PropertyMock(return_value=result), ) return 
mock_bq_model @pytest.fixture -def bq_model_good(mocker): +def bq_model_good(monkeypatch): return create_bq_model_mock( - mocker, + monkeypatch, [ { "name": "ident_culmen_length_mm", @@ -337,9 +338,9 @@ def bq_model_good(mocker): @pytest.fixture -def bq_model_merge(mocker): +def bq_model_merge(monkeypatch): return create_bq_model_mock( - mocker, + monkeypatch, [ { "name": "labelencoded_county", @@ -357,9 +358,9 @@ def bq_model_merge(mocker): @pytest.fixture -def bq_model_no_merge(mocker): +def bq_model_no_merge(monkeypatch): return create_bq_model_mock( - mocker, + monkeypatch, [ { "name": "ident_culmen_length_mm", @@ -372,9 +373,9 @@ def bq_model_no_merge(mocker): @pytest.fixture -def bq_model_unknown_ML(mocker): +def bq_model_unknown_ML(monkeypatch): return create_bq_model_mock( - mocker, + monkeypatch, [ { "name": "unknownml_culmen_length_mm", @@ -391,9 +392,9 @@ def bq_model_unknown_ML(mocker): @pytest.fixture -def bq_model_flexnames(mocker): +def bq_model_flexnames(monkeypatch): return create_bq_model_mock( - mocker, + monkeypatch, [ { "name": "Flex Name culmen_length_mm", diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index c9d147e18f..62cfe09704 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -17,10 +17,10 @@ from google.cloud import bigquery import pandas as pd import pytest -import pytest_mock import bigframes -from bigframes.ml import core, linear_model +from bigframes.ml import core, decomposition, linear_model +import bigframes.ml.core import bigframes.pandas as bpd TEMP_MODEL_ID = bigquery.ModelReference.from_string( @@ -50,10 +50,11 @@ def mock_session(): @pytest.fixture -def bqml_model_factory(mocker: pytest_mock.MockerFixture): - mocker.patch( - "bigframes.ml.core.BqmlModelFactory._create_model_ref", - return_value=TEMP_MODEL_ID, +def bqml_model_factory(monkeypatch): + monkeypatch.setattr( + bigframes.ml.core.BqmlModelFactory, + "_create_model_ref", + mock.Mock(return_value=TEMP_MODEL_ID), ) bqml_model_factory = core.BqmlModelFactory() @@ -80,6 +81,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + mock_X.reset_index(drop=True).cache().sql = "input_X_no_index_sql" mock_X.join(mock_y).sql = "input_X_y_sql" mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( @@ -209,3 +211,55 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" ) + + +def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model_factory = bqml_model_factory + model.fit(mock_X) + + mock_session._start_query_ml_ddl.assert_called_once_with( + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_no_index_sql" + ) + + +def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + 
l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.predict(mock_X) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", + index_col=["index_column_id"], + ) + + +def test_decomposition_mf_score(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.score(mock_X) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" + ) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py new file mode 100644 index 0000000000..92691ba9d4 --- /dev/null +++ b/tests/unit/ml/test_matrix_factorization.py @@ -0,0 +1,182 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest + +from bigframes.ml import decomposition + + +def test_decomposition_mf_model(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="implicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9, + ) + assert model.num_factors == 16 + assert model.feedback_type == "implicit" + assert model.user_col == "user_id" + assert model.item_col == "item_col" + assert model.rating_col == "rating_col" + + +def test_decomposition_mf_feedback_type_explicit(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.feedback_type == "explicit" + + +def test_decomposition_mf_invalid_feedback_type_raises(): + feedback_type = "explimp" + with pytest.raises( + ValueError, + match="Expected feedback_type to be `explicit` or `implicit`.", + ): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. 
+ num_factors=16, + feedback_type=feedback_type, # type: ignore + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_num_factors_low(): + model = decomposition.MatrixFactorization( + num_factors=0, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.num_factors == 0 + + +def test_decomposition_mf_negative_num_factors_raises(): + num_factors = -2 + with pytest.raises( + ValueError, + match=f"Expected num_factors to be a positive integer, but got {num_factors}.", + ): + decomposition.MatrixFactorization( + num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_num_factors_raises(): + num_factors = 0.5 + with pytest.raises( + TypeError, + match=f"Expected num_factors to be an int, but got {type(num_factors)}.", + ): + decomposition.MatrixFactorization( + num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_user_col_raises(): + user_col = 123 + with pytest.raises( + TypeError, match=f"Expected user_col to be a str, but got {type(user_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col=user_col, # type: ignore + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_item_col_raises(): + item_col = 123 + with pytest.raises( + TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_rating_col_raises(): + rating_col = 4 + with pytest.raises( + TypeError, match=f"Expected rating_col to be a str, but got {type(rating_col)}." 
+ ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col=rating_col, # type: ignore + l2_reg=9.83, + ) + + +def test_decomposition_mf_l2_reg(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=6.02, # type: ignore + ) + assert model.l2_reg == 6.02 + + +def test_decomposition_mf_invalid_l2_reg_raises(): + l2_reg = "6.02" + with pytest.raises( + TypeError, + match=f"Expected l2_reg to be a float or int, but got {type(l2_reg)}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=l2_reg, # type: ignore + ) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index fa05fffcb2..af2c7714ab 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -15,6 +15,7 @@ import datetime import re from typing import Iterable +from unittest import mock import google.cloud.bigquery as bigquery import pytest @@ -23,14 +24,14 @@ from bigframes.core import log_adapter import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq -from tests.unit import resources +from bigframes.testing import mocks @pytest.fixture(scope="function") -def mock_bq_client(mocker): - mock_client = mocker.Mock(spec=bigquery.Client) - mock_query_job = mocker.Mock(spec=bigquery.QueryJob) - mock_row_iterator = mocker.Mock(spec=bigquery.table.RowIterator) +def mock_bq_client(): + mock_client = mock.create_autospec(bigquery.Client) + mock_query_job = mock.create_autospec(bigquery.QueryJob) + mock_row_iterator = mock.create_autospec(bigquery.table.RowIterator) mock_query_job.result.return_value = mock_row_iterator @@ -97,7 +98,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): "source": "bigquery-dataframes-temp", } df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) # Test running two methods df.head() @@ -121,7 +122,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): log_adapter.get_and_reset_api_methods() df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) # Test running methods more than the labels' length limit for i in range(100): @@ -148,7 +149,7 @@ def test_create_job_configs_labels_length_limit_met(): cur_labels[key] = value # If cur_labels length is 62, we can only add one label from api_methods df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) # Test running two methods df.head() @@ -178,7 +179,7 @@ def test_add_and_trim_labels_length_limit_met(): cur_labels[key] = value df = bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) job_config = bigquery.job.QueryJobConfig() @@ -215,7 +216,7 @@ def test_start_query_with_client_labels_length_limit_met( cur_labels[key] = value df = 
bpd.DataFrame( - {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + {"col1": [1, 2], "col2": [3, 4]}, session=mocks.create_bigquery_session() ) job_config = bigquery.job.QueryJobConfig() @@ -248,7 +249,7 @@ def test_create_temp_table_default_expiration(): 2023, 11, 2, 13, 44, 55, 678901, datetime.timezone.utc ) - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() table_ref = bigquery.TableReference.from_string( "test-project.test_dataset.bqdf_new_random_table" ) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 2fa07aed35..224f343c7e 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -29,8 +29,7 @@ import bigframes.features import bigframes.pandas import bigframes.session._io.pandas - -from .. import resources +from bigframes.testing import mocks _LIST_OF_SCALARS = [ [1, 2, 3], @@ -496,7 +495,7 @@ def test_arrow_to_pandas_wrong_size_dtypes( def test_read_pandas_with_bigframes_dataframe(): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() df = mock.create_autospec(bigframes.pandas.DataFrame, instance=True) with pytest.raises( diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 8f01820fd3..a56b4ed7ab 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -20,8 +20,7 @@ import pytest import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table - -from .. import resources +from bigframes.testing import mocks @pytest.mark.parametrize( @@ -87,7 +86,7 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte bqclient.query_and_wait.return_value = ( {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, ) - session = resources.create_bigquery_session( + session = mocks.create_bigquery_session( bqclient=bqclient, table_schema=table.schema ) table._properties["location"] = session._location diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index b35449f291..490ffc4108 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -21,16 +21,14 @@ import google.api_core.exceptions import google.cloud.bigquery -import google.cloud.bigquery.table import pandas as pd -import pyarrow as pa import pytest import bigframes from bigframes import version import bigframes.enums import bigframes.exceptions -from tests.unit import resources +from bigframes.testing import mocks TABLE_REFERENCE = { "projectId": "my-project", @@ -137,8 +135,8 @@ ), ], ) -def test_read_csv_bq_engine_throws_not_implemented_error(kwargs, match): - session = resources.create_bigquery_session() +def test_read_csv_w_bq_engine_raises_error(kwargs, match): + session = mocks.create_bigquery_session() with pytest.raises(NotImplementedError, match=match): session.read_csv("", **kwargs) @@ -150,10 +148,11 @@ def test_read_csv_bq_engine_throws_not_implemented_error(kwargs, match): ("c",), ("python",), ("pyarrow",), + ("python-fwf",), ), ) -def test_read_csv_pandas_engines_index_col_sequential_int64_not_supported(engine): - session = resources.create_bigquery_session() +def test_read_csv_w_pandas_engines_raises_error_for_sequential_int64_index_col(engine): + session = mocks.create_bigquery_session() with pytest.raises(NotImplementedError, match="index_col"): session.read_csv( @@ -163,6 +162,22 @@ def 
test_read_csv_pandas_engines_index_col_sequential_int64_not_supported(engine ) +@pytest.mark.parametrize( + ("kwargs"), + [ + pytest.param({"chunksize": 5}, id="with_chunksize"), + pytest.param({"iterator": True}, id="with_iterator"), + ], +) +def test_read_csv_w_pandas_engines_raises_error_for_unsupported_args(kwargs): + session = mocks.create_bigquery_session() + with pytest.raises( + NotImplementedError, + match="'chunksize' and 'iterator' arguments are not supported.", + ): + session.read_csv("path/to/csv.csv", **kwargs) + + @pytest.mark.parametrize( ("engine", "write_engine"), ( @@ -178,7 +193,7 @@ def test_read_csv_pandas_engines_index_col_sequential_int64_not_supported(engine ), ) def test_read_csv_with_incompatible_write_engine(engine, write_engine): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() with pytest.raises( NotImplementedError, @@ -195,14 +210,14 @@ def test_read_csv_with_incompatible_write_engine(engine, write_engine): @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) def test_read_gbq_missing_parts(missing_parts_table_id): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() with pytest.raises(ValueError): session.read_gbq(missing_parts_table_id) def test_read_gbq_cached_table(): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() table_ref = google.cloud.bigquery.TableReference( google.cloud.bigquery.DatasetReference("my-project", "my_dataset"), "my_table", @@ -245,7 +260,7 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) - session = resources.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location with pytest.warns(bigframes.exceptions.DefaultIndexWarning): @@ -268,7 +283,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) - session = resources.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location # No warnings raised because we set the option allowing the default indexes. 
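A note on the fixture rewrite in tests/unit/session/test_io_bigquery.py above: replacing mocker.Mock(spec=bigquery.Client) with the stdlib mock.create_autospec(bigquery.Client) adds call-signature validation on top of attribute validation. The standalone sketch below is illustrative only, not part of the test suite:

from unittest import mock

import google.cloud.bigquery as bigquery

spec_mock = mock.Mock(spec=bigquery.Client)
autospec_mock = mock.create_autospec(bigquery.Client)

# Both mocks reject attributes that bigquery.Client does not define,
# but only the autospec mock checks how its methods are called.
spec_mock.get_table()  # silently recorded, despite the missing required `table` argument
try:
    autospec_mock.get_table()  # raises TypeError for the missing argument
except TypeError as exc:
    print(f"autospec caught it: {exc}")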
@@ -315,7 +330,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( bqclient.query_and_wait.return_value = ( {"total_count": total_count, "distinct_count": distinct_count}, ) - session = resources.create_bigquery_session( + session = mocks.create_bigquery_session( bqclient=bqclient, table_schema=table.schema ) table._properties["location"] = session._location @@ -357,7 +372,7 @@ def test_default_index_warning_not_raised_by_read_gbq_primary_key(table): bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - session = resources.create_bigquery_session( + session = mocks.create_bigquery_session( bqclient=bqclient, table_schema=table.schema ) table._properties["location"] = session._location @@ -382,7 +397,7 @@ def test_read_gbq_not_found_tables(not_found_table_id): bqclient.get_table.side_effect = google.api_core.exceptions.NotFound( "table not found" ) - session = resources.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session(bqclient=bqclient) with pytest.raises(google.api_core.exceptions.NotFound): session.read_gbq(not_found_table_id) @@ -404,7 +419,7 @@ def test_read_gbq_not_found_tables(not_found_table_id): ], ) def test_read_gbq_external_table_no_drive_access(api_name, query_or_table): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() session_query_mock = session.bqclient.query def query_mock(query, *args, **kwargs): @@ -459,12 +474,12 @@ def today(cls): monkeypatch.setattr(datetime, "datetime", FakeDatetime) with pytest.warns(bigframes.exceptions.ObsoleteVersionWarning): - resources.create_bigquery_session() + mocks.create_bigquery_session() @mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1) def test_read_pandas_inline_exceeds_limit_raises_error(): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) with pytest.raises( ValueError, @@ -474,20 +489,7 @@ def test_read_pandas_inline_exceeds_limit_raises_error(): def test_read_pandas_inline_w_interval_type_raises_error(): - session = resources.create_bigquery_session() + session = mocks.create_bigquery_session() df = pd.DataFrame(pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50])) - with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "): + with pytest.raises(TypeError): session.read_pandas(df, write_engine="bigquery_inline") - - -def test_read_pandas_inline_w_noninlineable_type_raises_error(): - session = resources.create_bigquery_session() - data = [ - [1, 2, 3], - [4, 5], - None, - [6, 7, 8, 9], - ] - s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"): - session.read_pandas(s, write_engine="bigquery_inline") diff --git a/tests/unit/session/test_time.py b/tests/unit/session/test_time.py index 87766e79bb..39a231c3ce 100644 --- a/tests/unit/session/test_time.py +++ b/tests/unit/session/test_time.py @@ -15,7 +15,6 @@ import datetime import unittest.mock as mock -import freezegun import google.cloud.bigquery import pytest @@ -47,6 +46,8 @@ def query_and_wait_mock(query, *args, **kwargs): def test_bqsyncedclock_get_time(bq_client): + freezegun = pytest.importorskip("freezegun") + # this initial local time is actually irrelevant, only the ticks matter initial_local_datetime = datetime.datetime( year=1, month=7, day=12, hour=15, minute=6, 
second=3 diff --git a/tests/unit/test_clients.py b/tests/unit/test_clients.py index 37450ececb..032512c26e 100644 --- a/tests/unit/test_clients.py +++ b/tests/unit/test_clients.py @@ -17,33 +17,51 @@ from bigframes import clients -def test_get_connection_name_full_connection_id(): - connection_name = clients.resolve_full_bq_connection_name( +def test_get_canonical_bq_connection_id_connection_id_only(): + connection_id = clients.get_canonical_bq_connection_id( "connection-id", default_project="default-project", default_location="us" ) - assert connection_name == "default-project.us.connection-id" + assert connection_id == "default-project.us.connection-id" -def test_get_connection_name_full_location_connection_id(): - connection_name = clients.resolve_full_bq_connection_name( +def test_get_canonical_bq_connection_id_location_and_connection_id(): + connection_id = clients.get_canonical_bq_connection_id( "eu.connection-id", default_project="default-project", default_location="us" ) - assert connection_name == "default-project.eu.connection-id" + assert connection_id == "default-project.eu.connection-id" -def test_get_connection_name_full_all(): - connection_name = clients.resolve_full_bq_connection_name( +def test_get_canonical_bq_connection_id_already_canonical(): + connection_id = clients.get_canonical_bq_connection_id( "my-project.eu.connection-id", default_project="default-project", default_location="us", ) - assert connection_name == "my-project.eu.connection-id" + assert connection_id == "my-project.eu.connection-id" -def test_get_connection_name_full_raise_value_error(): - with pytest.raises(ValueError): - clients.resolve_full_bq_connection_name( +def test_get_canonical_bq_connection_id_invalid(): + with pytest.raises(ValueError, match="Invalid connection id format"): + clients.get_canonical_bq_connection_id( "my-project.eu.connection-id.extra_field", default_project="default-project", default_location="us", ) + + +def test_get_canonical_bq_connection_id_valid_path(): + connection_id = clients.get_canonical_bq_connection_id( + "projects/project_id/locations/northamerica-northeast1/connections/connection-id", + default_project="default-project", + default_location="us", + ) + assert connection_id == "project_id.northamerica-northeast1.connection-id" + + +def test_get_canonical_bq_connection_id_invalid_path(): + with pytest.raises(ValueError, match="Invalid connection id format"): + clients.get_canonical_bq_connection_id( + "/projects/project_id/locations/northamerica-northeast1/connections/connection-id", + default_project="default-project", + default_location="us", + ) diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py new file mode 100644 index 0000000000..6b3acd7d7d --- /dev/null +++ b/tests/unit/test_daemon.py @@ -0,0 +1,42 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import time +from unittest.mock import MagicMock + +from bigframes.session.bigquery_session import RecurringTaskDaemon + + +def test_recurring_task_daemon_calls(): + mock_task = MagicMock() + daemon = RecurringTaskDaemon( + task=mock_task, frequency=datetime.timedelta(seconds=0.1) + ) + daemon.start() + time.sleep(1.0) + daemon.stop() + time.sleep(0.5) + # be lenient, but number of calls should be in this ballpark regardless of scheduling hiccups + assert mock_task.call_count > 6 + assert mock_task.call_count < 12 + + +def test_recurring_task_daemon_never_started(): + mock_task = MagicMock() + _ = RecurringTaskDaemon( + task=mock_task, frequency=datetime.timedelta(seconds=0.0001) + ) + time.sleep(0.1) + assert mock_task.call_count == 0 diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 8d1e77510a..9d67fd33b7 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -16,14 +16,13 @@ import pytest import bigframes.dataframe - -from . import resources +from bigframes.testing import mocks def test_dataframe_dropna_axis_1_subset_not_implememented( monkeypatch: pytest.MonkeyPatch, ): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) with pytest.raises(NotImplementedError, match="subset"): dataframe.dropna(axis=1, subset=["col1", "col2"]) @@ -51,14 +50,14 @@ def test_dataframe_setattr_with_uninitialized_object(): def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) with pytest.raises(ValueError, match="no_dataset_or_project"): dataframe.to_gbq("no_dataset_or_project") def test_dataframe_to_gbq_invalid_if_exists(monkeypatch: pytest.MonkeyPatch): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) with pytest.raises(ValueError, match="notreallyanoption"): # Even though the type is annotated with the literals we accept, users @@ -70,7 +69,7 @@ def test_dataframe_to_gbq_invalid_if_exists(monkeypatch: pytest.MonkeyPatch): def test_dataframe_to_gbq_invalid_if_exists_no_destination( monkeypatch: pytest.MonkeyPatch, ): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) with pytest.raises(ValueError, match="append"): dataframe.to_gbq(if_exists="append") @@ -83,8 +82,8 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( anonymous_dataset = google.cloud.bigquery.DatasetReference.from_string( anonymous_dataset_id ) - session = resources.create_bigquery_session(anonymous_dataset=anonymous_dataset) - dataframe = resources.create_dataframe(monkeypatch, session=session) + session = mocks.create_bigquery_session(anonymous_dataset=anonymous_dataset) + dataframe = mocks.create_dataframe(monkeypatch, session=session) destination = dataframe.to_gbq() @@ -94,7 +93,7 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( def test_dataframe_semantics_property_future_warning( monkeypatch: pytest.MonkeyPatch, ): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) with bigframes.option_context("experiments.semantic_operators", True), pytest.warns( FutureWarning diff --git a/tests/unit/test_dataframe_io.py b/tests/unit/test_dataframe_io.py index 5deb0d7a24..7845a71134 100644 --- a/tests/unit/test_dataframe_io.py +++ b/tests/unit/test_dataframe_io.py @@ -12,17 +12,17 @@ # See the License for the specific 
language governing permissions and # limitations under the License. -from unittest.mock import Mock +from unittest import mock import pytest -from . import resources +from bigframes.testing import mocks @pytest.fixture def mock_df(monkeypatch: pytest.MonkeyPatch): - dataframe = resources.create_dataframe(monkeypatch) - monkeypatch.setattr(dataframe, "to_pandas", Mock()) + dataframe = mocks.create_dataframe(monkeypatch) + monkeypatch.setattr(dataframe, "to_pandas", mock.Mock()) return dataframe diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 4697c84960..d4e0dae1f3 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -19,9 +19,9 @@ import bigframes import bigframes.pandas as bpd -from tests.system.utils import skip_legacy_pandas pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") # All tests in this file require polars to be installed to pass. @@ -50,8 +50,6 @@ def small_inline_frame() -> pd.DataFrame: return df -# These tests should be unit tests, but Session object is tightly coupled to BigQuery client. -@skip_legacy_pandas def test_polars_local_engine_add( small_inline_frame: pd.DataFrame, polars_session: bigframes.Session ): @@ -63,7 +61,6 @@ def test_polars_local_engine_add( pandas.testing.assert_series_equal(bf_result, pd_result) -@skip_legacy_pandas def test_polars_local_engine_order_by(small_inline_frame: pd.DataFrame, polars_session): pd_df = small_inline_frame bf_df = bpd.DataFrame(pd_df, session=polars_session) @@ -73,7 +70,6 @@ def test_polars_local_engine_order_by(small_inline_frame: pd.DataFrame, polars_s pandas.testing.assert_frame_equal(bf_result, pd_result) -@skip_legacy_pandas def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_session): pd_df = small_inline_frame bf_df = bpd.DataFrame(pd_df, session=polars_session) @@ -83,7 +79,6 @@ def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_ses pandas.testing.assert_frame_equal(bf_result, pd_result) -@skip_legacy_pandas def test_polars_local_engine_reset_index( small_inline_frame: pd.DataFrame, polars_session ): @@ -96,7 +91,6 @@ def test_polars_local_engine_reset_index( pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) -@skip_legacy_pandas def test_polars_local_engine_join_binop(polars_session): pd_df_1 = pd.DataFrame({"colA": [1, None, 3], "colB": [3, 1, 2]}, index=[1, 2, 3]) pd_df_2 = pd.DataFrame( @@ -116,7 +110,6 @@ def test_polars_local_engine_join_binop(polars_session): ) -@skip_legacy_pandas @pytest.mark.parametrize( "join_type", ["inner", "left", "right", "outer"], @@ -139,7 +132,6 @@ def test_polars_local_engine_joins(join_type, polars_session): ) -@skip_legacy_pandas def test_polars_local_engine_agg(polars_session): pd_df = pd.DataFrame( {"colA": [True, False, True, False, True], "colB": [1, 2, 3, 4, 5]} @@ -152,7 +144,6 @@ def test_polars_local_engine_agg(polars_session): pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False, check_index_type=False) # type: ignore -@skip_legacy_pandas def test_polars_local_engine_groupby_sum(polars_session): pd_df = pd.DataFrame( {"colA": [True, False, True, False, True], "colB": [1, 2, 3, 4, 5]} @@ -166,7 +157,6 @@ def test_polars_local_engine_groupby_sum(polars_session): ) -@skip_legacy_pandas def test_polars_local_engine_cumsum(small_inline_frame, polars_session): pd_df = small_inline_frame[["int1", "int2"]] bf_df = bpd.DataFrame(pd_df, session=polars_session) @@ -176,7 +166,6 @@ def 
test_polars_local_engine_cumsum(small_inline_frame, polars_session): pandas.testing.assert_frame_equal(bf_result, pd_result) -@skip_legacy_pandas def test_polars_local_engine_explode(small_inline_frame, polars_session): pd_df = small_inline_frame bf_df = bpd.DataFrame(pd_df, session=polars_session) @@ -206,7 +195,6 @@ def test_polars_local_engine_explode(small_inline_frame, polars_session): (7, -7, -2), ], ) -@skip_legacy_pandas def test_polars_local_engine_slice( small_inline_frame, polars_session, start, stop, step ): diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 64a287aaca..e8383512a6 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -91,13 +91,48 @@ def test_method_matches_session(method_name: str): assert pandas_signature.return_annotation == session_signature.return_annotation -def test_cut_raises_with_labels(): +@pytest.mark.parametrize( + ("bins", "labels", "error_message"), + [ + pytest.param( + 5, + True, + "Bin labels must either be False, None or passed in as a list-like argument", + id="true", + ), + pytest.param( + 5, + 1.5, + "Bin labels must either be False, None or passed in as a list-like argument", + id="invalid_types", + ), + pytest.param( + 2, + ["A"], + "must be same as the value of bins", + id="int_bins_mismatch", + ), + pytest.param( + [1, 2, 3], + ["A"], + "must be same as the number of bin edges", + id="iterator_bins_mismatch", + ), + ], +) +def test_cut_raises_with_invalid_labels(bins: int, labels, error_message: str): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + with pytest.raises(ValueError, match=error_message): + bigframes.pandas.cut(mock_series, bins, labels=labels) + + +def test_cut_raises_with_unsupported_labels(): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + labels = [1, 2] with pytest.raises( - NotImplementedError, - match="The 'labels' parameter must be either False or None.", + NotImplementedError, match=r".*only iterables of strings are supported.*" ): - mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) - bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) + bigframes.pandas.cut(mock_series, 2, labels=labels) # type: ignore @pytest.mark.parametrize( @@ -111,11 +146,21 @@ def test_cut_raises_with_labels(): "`bins` iterable should contain tuples or numerics", id="iterable_w_wrong_type", ), + pytest.param( + [10, 3], + "left side of interval must be <= right side", + id="decreased_breaks", + ), + pytest.param( + [(1, 10), (2, 25)], + "Overlapping IntervalIndex is not accepted.", + id="overlapping_intervals", + ), ], ) def test_cut_raises_with_invalid_bins(bins: int, error_message: str): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) with pytest.raises(ValueError, match=error_message): - mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, bins, labels=False) diff --git a/tests/unit/test_series_io.py b/tests/unit/test_series_io.py index a97293d3da..bb0ea15053 100644 --- a/tests/unit/test_series_io.py +++ b/tests/unit/test_series_io.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from unittest.mock import Mock +from unittest import mock import pytest -from . 
import resources +from bigframes.testing import mocks @pytest.fixture def mock_series(monkeypatch: pytest.MonkeyPatch): - dataframe = resources.create_dataframe(monkeypatch) + dataframe = mocks.create_dataframe(monkeypatch) series = dataframe["col"] - monkeypatch.setattr(series, "to_pandas", Mock()) + monkeypatch.setattr(series, "to_pandas", mock.Mock()) return series diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index e75bdf81e0..4ad4f383cf 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -45,11 +45,11 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array - >>> import shapely + >>> import shapely.geometry >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( - ... [shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], + ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], ... dtype=geopandas.array.GeometryDtype() ... ) >>> series.geo.x @@ -72,11 +72,11 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array - >>> import shapely + >>> import shapely.geometry >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( - ... [shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], + ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], ... dtype=geopandas.array.GeometryDtype() ... ) >>> series.geo.y @@ -101,7 +101,7 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array - >>> import shapely + >>> import shapely.geometry >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point @@ -120,7 +120,7 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> s.boundary 0 LINESTRING (0 0, 1 1, 0 1, 0 0) - 1 MULTIPOINT (0 0, 1 0) + 1 MULTIPOINT ((0 0), (1 0)) 2 GEOMETRYCOLLECTION EMPTY dtype: geometry diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index 71e5d9e3df..a87cb081cb 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -28,6 +28,7 @@ from bigframes_vendored.ibis.backends.sql.compilers import BigQueryCompiler from bigframes_vendored.ibis.backends.sql.datatypes import BigQueryType import bigframes_vendored.ibis.common.exceptions as com +import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations as ops import bigframes_vendored.ibis.expr.schema as sch import bigframes_vendored.ibis.expr.types as ir @@ -773,7 +774,7 @@ def execute(self, expr, params=None, limit="default", **kwargs): self._run_pre_execute_hooks(expr) schema = expr.as_table().schema() - bigframes_vendored.ibis.schema( - {"_TABLE_SUFFIX": "string"} + {"_TABLE_SUFFIX": ibis_dtypes.string()} ) sql = self.compile(expr, limit=limit, params=params, **kwargs) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index d1ab36c41a..6e98d6a9e1 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -706,6 +706,10 @@ 
def visit_Literal(self, op, *, value, dtype): else return the result of the previous step. """ if value is None: + if dtype.is_array(): + # hack: bq arrays are like semi-nullable, but want to treat as non-nullable for simplicity + # instead, use empty array as missing value sentinel + return self.cast(self.f.array(), dtype) if dtype.nullable: return NULL if dtype.is_null() else self.cast(NULL, dtype) raise ibis_exceptions.UnsupportedOperationError( @@ -763,8 +767,9 @@ def visit_DefaultLiteral(self, op, *, value, dtype): elif dtype.is_date(): return self.f.datefromparts(value.year, value.month, value.day) elif dtype.is_array(): + # array type is ambiguous if no elements value_type = dtype.value_type - return self.f.array( + values = self.f.array( *( self.visit_Literal( ops.Literal(v, value_type), value=v, dtype=value_type @@ -772,6 +777,7 @@ def visit_DefaultLiteral(self, op, *, value, dtype): for v in value ) ) + return values if len(value) > 0 else self.cast(values, dtype) elif dtype.is_map(): key_type = dtype.key_type keys = self.f.array( @@ -804,11 +810,11 @@ def visit_DefaultLiteral(self, op, *, value, dtype): return sge.Struct.from_arg_list(items) elif dtype.is_uuid(): return self.cast(str(value), dtype) + elif dtype.is_json(): + return sge.ParseJSON(this=sge.convert(str(value))) elif dtype.is_geospatial(): - args = [value.wkt] - if (srid := dtype.srid) is not None: - args.append(srid) - return self.f.st_geomfromtext(*args) + wkt = value if isinstance(value, str) else value.wkt + return self.f.st_geogfromtext(wkt) raise NotImplementedError(f"Unsupported type: {dtype!r}") diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 7d6cd6d2b4..7e001d1ac3 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -3,6 +3,7 @@ from __future__ import annotations +import datetime import decimal import math import re @@ -478,6 +479,11 @@ def visit_NonNullLiteral(self, op, *, value, dtype): return sge.convert(str(value)) elif dtype.is_int64(): + # allows directly using values out of a duration arrow array + if isinstance(value, datetime.timedelta): + value = ( + (value.days * 3600 * 24) + value.seconds + ) * 1_000_000 + value.microseconds return sge.convert(np.int64(value)) return None @@ -1024,7 +1030,7 @@ def visit_InMemoryTable(self, op, *, name, schema, data): # Avoid creating temp tables for small data, which is how memtable is # used in BigQuery DataFrames. 
Inspired by: # https://github.com/ibis-project/ibis/blob/efa6fb72bf4c790450d00a926d7bd809dade5902/ibis/backends/druid/compiler.py#L95 - tuples = data.to_frame().itertuples(index=False) + rows = data.to_pyarrow(schema=None).to_pylist() # type: ignore quoted = self.quoted columns = [sg.column(col, quoted=quoted) for col in schema.names] array_expr = sge.DataType( @@ -1042,10 +1048,10 @@ def visit_InMemoryTable(self, op, *, name, schema, data): sge.Struct( expressions=tuple( self.visit_Literal(None, value=value, dtype=type_) - for value, type_ in zip(row, schema.types) + for value, type_ in zip(row.values(), schema.types) ) ) - for row in tuples + for row in rows ] expr = sge.Unnest( expressions=[ diff --git a/third_party/bigframes_vendored/ibis/expr/api.py b/third_party/bigframes_vendored/ibis/expr/api.py index 8427ab1c4b..4ef10e449b 100644 --- a/third_party/bigframes_vendored/ibis/expr/api.py +++ b/third_party/bigframes_vendored/ibis/expr/api.py @@ -2369,7 +2369,7 @@ def ifelse(condition: Any, true_expr: Any, false_expr: Any) -> ir.Value: if not isinstance(condition, ir.Value): condition = literal(condition, type="bool") elif not condition.type().is_boolean(): - condition = condition.cast("bool") + condition = condition.cast(bool) return condition.ifelse(true_expr, false_expr) diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/__init__.py b/third_party/bigframes_vendored/ibis/expr/datatypes/__init__.py index e17050c865..2ff4d41ab5 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/__init__.py @@ -4,7 +4,6 @@ from bigframes_vendored.ibis.expr.datatypes.cast import * # noqa: F403 from bigframes_vendored.ibis.expr.datatypes.core import * # noqa: F403 -from bigframes_vendored.ibis.expr.datatypes.parse import * # noqa: F403 from bigframes_vendored.ibis.expr.datatypes.value import * # noqa: F403 halffloat = float16 # noqa: F405 diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/core.py b/third_party/bigframes_vendored/ibis/expr/datatypes/core.py index 73dd375563..eb597cfc6a 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/core.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/core.py @@ -167,15 +167,6 @@ def castable(self, to, **kwargs) -> bool: return castable(self, to, **kwargs) - @classmethod - def from_string(cls, value) -> Self: - from bigframes_vendored.ibis.expr.datatypes.parse import parse - - try: - return parse(value) - except SyntaxError: - raise TypeError(f"{value!r} cannot be parsed as a datatype") - @classmethod def from_typehint(cls, typ, nullable=True) -> Self: origin_type = get_origin(typ) diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/parse.py b/third_party/bigframes_vendored/ibis/expr/datatypes/parse.py deleted file mode 100644 index 78bbe0347c..0000000000 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/parse.py +++ /dev/null @@ -1,211 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/expr/datatypes/parse.py - -from __future__ import annotations - -import ast -import functools -from operator import methodcaller -import re - -import bigframes_vendored.ibis.expr.datatypes.core as dt -import parsy -from public import public - -_STRING_REGEX = ( - """('[^\n'\\\\]*(?:\\\\.[^\n'\\\\]*)*'|"[^\n"\\\\"]*(?:\\\\.[^\n"\\\\]*)*")""" -) - -SPACES = parsy.regex(r"\s*", re.MULTILINE) - - -def spaceless(parser): - return SPACES.then(parser).skip(SPACES) - - -def spaceless_string(*strings: str): - return 
spaceless( - parsy.alt(*(parsy.string(string, transform=str.lower) for string in strings)) - ) - - -SINGLE_DIGIT = parsy.decimal_digit -RAW_NUMBER = SINGLE_DIGIT.at_least(1).concat() -PRECISION = SCALE = NUMBER = LENGTH = RAW_NUMBER.map(int) -TEMPORAL_SCALE = SINGLE_DIGIT.map(int) - -LPAREN = spaceless_string("(") -RPAREN = spaceless_string(")") - -LBRACKET = spaceless_string("[") -RBRACKET = spaceless_string("]") - -LANGLE = spaceless_string("<") -RANGLE = spaceless_string(">") - -COMMA = spaceless_string(",") -COLON = spaceless_string(":") -SEMICOLON = spaceless_string(";") - -RAW_STRING = parsy.regex(_STRING_REGEX).map(ast.literal_eval) -FIELD = parsy.regex("[a-zA-Z_0-9]+") | parsy.string("") - - -@public -@functools.lru_cache(maxsize=100) -def parse( - text: str, default_decimal_parameters: tuple[int | None, int | None] = (None, None) -) -> dt.DataType: - """Parse a type from a [](`str`) `text`. - - The default `maxsize` parameter for caching is chosen to cache the most - commonly used types--there are about 30--along with some capacity for less - common but repeatedly-used complex types. - - Parameters - ---------- - text - The type string to parse - default_decimal_parameters - Default precision and scale for decimal types - - Examples - -------- - Parse an array type from a string - - >>> import ibis - >>> import ibis.expr.datatypes as dt - >>> dt.parse("array") - Array(value_type=Int64(nullable=True), nullable=True) - - You can avoid parsing altogether by constructing objects directly - - >>> import ibis - >>> import ibis.expr.datatypes as dt - >>> ty = dt.parse("array") - >>> ty == dt.Array(dt.int64) - True - - """ - geotype = spaceless_string("geography", "geometry") - - srid_geotype = SEMICOLON.then(parsy.seq(srid=NUMBER.skip(COLON), geotype=geotype)) - geotype_part = COLON.then(parsy.seq(geotype=geotype)) - srid_part = SEMICOLON.then(parsy.seq(srid=NUMBER)) - - def geotype_parser(typ: type[dt.DataType]) -> dt.DataType: - return spaceless_string(typ.__name__.lower()).then( - (srid_geotype | geotype_part | srid_part).optional(dict()).combine_dict(typ) - ) - - primitive = ( - spaceless_string("boolean", "bool").result(dt.boolean) - | spaceless_string("halffloat", "float16").result(dt.float16) - | spaceless_string("float32").result(dt.float32) - | spaceless_string("double", "float64", "float").result(dt.float64) - | spaceless_string( - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "string", - "binary", - "timestamp", - "time", - "date", - "null", - ).map(functools.partial(getattr, dt)) - | spaceless_string("bytes").result(dt.binary) - | geotype.map(dt.GeoSpatial) - | geotype_parser(dt.LineString) - | geotype_parser(dt.Polygon) - | geotype_parser(dt.Point) - | geotype_parser(dt.MultiLineString) - | geotype_parser(dt.MultiPolygon) - | geotype_parser(dt.MultiPoint) - ) - - varchar_or_char = ( - spaceless_string("varchar", "char") - .then(LPAREN.then(RAW_NUMBER).skip(RPAREN).optional()) - .result(dt.string) - ) - - decimal = spaceless_string("decimal").then( - parsy.seq( - LPAREN.then(spaceless(PRECISION)).skip(COMMA), spaceless(SCALE).skip(RPAREN) - ) - .optional(default_decimal_parameters) - .combine(dt.Decimal) - ) - - bignumeric = spaceless_string("bignumeric", "bigdecimal").then( - parsy.seq( - LPAREN.then(spaceless(PRECISION)).skip(COMMA), spaceless(SCALE).skip(RPAREN) - ) - .optional((76, 38)) - .combine(dt.Decimal) - ) - - parened_string = LPAREN.then(RAW_STRING).skip(RPAREN) - timestamp_scale = SINGLE_DIGIT.map(int) - - 
timestamp_tz_args = LPAREN.then( - parsy.seq(timezone=RAW_STRING, scale=COMMA.then(timestamp_scale).optional()) - ).skip(RPAREN) - - timestamp_no_tz_args = LPAREN.then(parsy.seq(scale=timestamp_scale).skip(RPAREN)) - - timestamp = spaceless_string("timestamp").then( - (timestamp_tz_args | timestamp_no_tz_args) - .optional({}) - .combine_dict(dt.Timestamp) - ) - - interval = spaceless_string("interval").then( - parsy.seq(unit=parened_string.optional("s")).combine_dict(dt.Interval) - ) - - ty = parsy.forward_declaration() - angle_type = LANGLE.then(ty).skip(RANGLE) - array = spaceless_string("array").then(angle_type).map(dt.Array) - - map = ( - spaceless_string("map") - .then(LANGLE) - .then(parsy.seq(ty, COMMA.then(ty)).combine(dt.Map)) - .skip(RANGLE) - ) - - struct = ( - spaceless_string("struct") - .then(LANGLE) - .then(parsy.seq(spaceless(FIELD).skip(COLON), ty).sep_by(COMMA)) - .skip(RANGLE) - .map(dt.Struct.from_tuples) - ) - - nullable = spaceless_string("!").then(ty).map(methodcaller("copy", nullable=False)) - - ty.become( - nullable - | timestamp - | primitive - | decimal - | bignumeric - | varchar_or_char - | interval - | array - | map - | struct - | spaceless_string("jsonb", "json", "uuid", "macaddr", "inet").map( - functools.partial(getattr, dt) - ) - | spaceless_string("int").result(dt.int64) - | spaceless_string("str").result(dt.string) - ) - - return ty.parse(text) diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py index f9302b63f4..e390cea02c 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py @@ -312,15 +312,16 @@ def normalize(typ, value): ) return frozendict({k: normalize(t, value[k]) for k, t in dtype.items()}) elif dtype.is_geospatial(): - import shapely as shp + import shapely + import shapely.geometry if isinstance(value, (tuple, list)): if dtype.is_point(): - return shp.Point(value) + return shapely.geometry.Point(value) elif dtype.is_linestring(): - return shp.LineString(value) + return shapely.geometry.LineString(value) elif dtype.is_polygon(): - return shp.Polygon( + return shapely.geometry.Polygon( toolz.concat( map( attrgetter("coords"), @@ -329,19 +330,23 @@ def normalize(typ, value): ) ) elif dtype.is_multipoint(): - return shp.MultiPoint(tuple(map(partial(normalize, dt.point), value))) + return shapely.geometry.MultiPoint( + tuple(map(partial(normalize, dt.point), value)) + ) elif dtype.is_multilinestring(): - return shp.MultiLineString( + return shapely.geometry.MultiLineString( tuple(map(partial(normalize, dt.linestring), value)) ) elif dtype.is_multipolygon(): - return shp.MultiPolygon(map(partial(normalize, dt.polygon), value)) + return shapely.geometry.MultiPolygon( + map(partial(normalize, dt.polygon), value) + ) else: raise IbisTypeError(f"Unsupported geospatial type: {dtype}") - elif isinstance(value, shp.geometry.base.BaseGeometry): + elif isinstance(value, shapely.geometry.base.BaseGeometry): return value else: - return shp.from_wkt(value) + return shapely.from_wkt(value) elif dtype.is_date(): return normalize_datetime(value).date() elif dtype.is_time(): diff --git a/third_party/bigframes_vendored/ibis/expr/types/arrays.py b/third_party/bigframes_vendored/ibis/expr/types/arrays.py index 5f86cfe477..a8f64490c1 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/arrays.py +++ b/third_party/bigframes_vendored/ibis/expr/types/arrays.py @@ -416,7 +416,7 @@ def map(self, func: 
Deferred | Callable[[ir.Value], ir.Value]) -> ir.ArrayValue: The most succinct way to use `map` is with `Deferred` expressions: - >>> t.a.map((_ + 100).cast("float")) + >>> t.a.map((_ + 100).cast(float)) ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ArrayMap(a, Cast(Add(_, 100), float64)) ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ @@ -429,7 +429,7 @@ def map(self, func: Deferred | Callable[[ir.Value], ir.Value]) -> ir.ArrayValue: You can also use `map` with a lambda function: - >>> t.a.map(lambda x: (x + 100).cast("float")) + >>> t.a.map(lambda x: (x + 100).cast(float)) ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ArrayMap(a, Cast(Add(x, 100), float64)) ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ diff --git a/third_party/bigframes_vendored/ibis/expr/types/generic.py b/third_party/bigframes_vendored/ibis/expr/types/generic.py index 607170e1ca..7de357b138 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/generic.py +++ b/third_party/bigframes_vendored/ibis/expr/types/generic.py @@ -179,31 +179,10 @@ def cast(self, target_type: Any) -> Value: │ … │ └────────────────────────────┘ - or string names - - >>> x.cast("uint16") - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Cast(bill_depth_mm, uint16) ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ uint16 │ - ├─────────────────────────────┤ - │ 19 │ - │ 17 │ - │ 18 │ - │ NULL │ - │ 19 │ - │ 21 │ - │ 18 │ - │ 20 │ - │ 18 │ - │ 20 │ - │ … │ - └─────────────────────────────┘ - If you make an illegal cast, you won't know until the backend actually executes it. Consider [`.try_cast()`](#ibis.expr.types.generic.Value.try_cast). - >>> ibis.literal("a string").cast("int64") # doctest: +SKIP + >>> ibis.literal("a string").cast(int) # doctest: +SKIP """ op = ops.Cast(self, to=target_type) diff --git a/third_party/bigframes_vendored/ibis/expr/types/geospatial.py b/third_party/bigframes_vendored/ibis/expr/types/geospatial.py index 3f42a4ad14..298e74d6de 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/geospatial.py +++ b/third_party/bigframes_vendored/ibis/expr/types/geospatial.py @@ -135,7 +135,7 @@ def contains(self, right: GeoSpatialValue) -> ir.BooleanValue: >>> ibis.options.interactive = True >>> import shapely >>> t = ibis.examples.zones.fetch() - >>> p = shapely.Point(935996.821, 191376.75) # centroid for zone 1 + >>> p = shapely.geometry.Point(935996.821, 191376.75) # centroid for zone 1 >>> plit = ibis.literal(p, "geometry") >>> t.geom.contains(plit).name("contains") ┏━━━━━━━━━━┓ @@ -197,7 +197,7 @@ def covers(self, right: GeoSpatialValue) -> ir.BooleanValue: Polygon area center in zone 1 - >>> z1_ctr_buff = shapely.Point(935996.821, 191376.75).buffer(10) + >>> z1_ctr_buff = shapely.geometry.Point(935996.821, 191376.75).buffer(10) >>> z1_ctr_buff_lit = ibis.literal(z1_ctr_buff, "geometry") >>> t.geom.covers(z1_ctr_buff_lit).name("covers") ┏━━━━━━━━━┓ @@ -242,7 +242,7 @@ def covered_by(self, right: GeoSpatialValue) -> ir.BooleanValue: Polygon area center in zone 1 - >>> pol_big = shapely.Point(935996.821, 191376.75).buffer(10000) + >>> pol_big = shapely.geometry.Point(935996.821, 191376.75).buffer(10000) >>> pol_big_lit = ibis.literal(pol_big, "geometry") >>> t.geom.covered_by(pol_big_lit).name("covered_by") ┏━━━━━━━━━━━━┓ @@ -262,7 +262,7 @@ def covered_by(self, right: GeoSpatialValue) -> ir.BooleanValue: │ False │ │ … │ └────────────┘ - >>> pol_small = shapely.Point(935996.821, 191376.75).buffer(100) + >>> pol_small = shapely.geometry.Point(935996.821, 191376.75).buffer(100) >>> pol_small_lit = ibis.literal(pol_small, "geometry") >>> 
t.geom.covered_by(pol_small_lit).name("covered_by") ┏━━━━━━━━━━━━┓ @@ -387,7 +387,7 @@ def disjoint(self, right: GeoSpatialValue) -> ir.BooleanValue: >>> ibis.options.interactive = True >>> import shapely >>> t = ibis.examples.zones.fetch() - >>> p = shapely.Point(935996.821, 191376.75) # zone 1 centroid + >>> p = shapely.geometry.Point(935996.821, 191376.75) # zone 1 centroid >>> plit = ibis.literal(p, "geometry") >>> t.geom.disjoint(plit).name("disjoint") ┏━━━━━━━━━━┓ @@ -435,7 +435,7 @@ def d_within( >>> ibis.options.interactive = True >>> import shapely >>> t = ibis.examples.zones.fetch() - >>> penn_station = shapely.Point(986345.399, 211974.446) + >>> penn_station = shapely.geometry.Point(986345.399, 211974.446) >>> penn_lit = ibis.literal(penn_station, "geometry") Check zones within 1000ft of Penn Station centroid @@ -578,7 +578,7 @@ def intersects(self, right: GeoSpatialValue) -> ir.BooleanValue: >>> ibis.options.interactive = True >>> import shapely >>> t = ibis.examples.zones.fetch() - >>> p = shapely.Point(935996.821, 191376.75) # zone 1 centroid + >>> p = shapely.geometry.Point(935996.821, 191376.75) # zone 1 centroid >>> plit = ibis.literal(p, "geometry") >>> t.geom.intersects(plit).name("intersects") ┏━━━━━━━━━━━━┓ @@ -675,7 +675,7 @@ def overlaps(self, right: GeoSpatialValue) -> ir.BooleanValue: Polygon center in an edge point of zone 1 - >>> p_edge_buffer = shapely.Point(933100.918, 192536.086).buffer(100) + >>> p_edge_buffer = shapely.geometry.Point(933100.918, 192536.086).buffer(100) >>> buff_lit = ibis.literal(p_edge_buffer, "geometry") >>> t.geom.overlaps(buff_lit).name("overlaps") ┏━━━━━━━━━━┓ @@ -720,7 +720,7 @@ def touches(self, right: GeoSpatialValue) -> ir.BooleanValue: Edge point of zone 1 - >>> p_edge = shapely.Point(933100.9183527103, 192536.08569720192) + >>> p_edge = shapely.geometry.Point(933100.9183527103, 192536.08569720192) >>> p_edge_lit = ibis.literal(p_edge, "geometry") >>> t.geom.touches(p_edge_lit).name("touches") ┏━━━━━━━━━┓ @@ -765,7 +765,7 @@ def distance(self, right: GeoSpatialValue) -> ir.FloatingValue: Penn station zone centroid - >>> penn_station = shapely.Point(986345.399, 211974.446) + >>> penn_station = shapely.geometry.Point(986345.399, 211974.446) >>> penn_lit = ibis.literal(penn_station, "geometry") >>> t.geom.distance(penn_lit).name("distance_penn") ┏━━━━━━━━━━━━━━━┓ @@ -886,7 +886,7 @@ def union(self, right: GeoSpatialValue) -> GeoSpatialValue: Penn station zone centroid - >>> penn_station = shapely.Point(986345.399, 211974.446) + >>> penn_station = shapely.geometry.Point(986345.399, 211974.446) >>> penn_lit = ibis.literal(penn_station, "geometry") >>> t.geom.centroid().union(penn_lit).name("union_centroid_penn") ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1312,7 +1312,7 @@ def within(self, right: GeoSpatialValue) -> ir.BooleanValue: >>> ibis.options.interactive = True >>> import shapely >>> t = ibis.examples.zones.fetch() - >>> penn_station_buff = shapely.Point(986345.399, 211974.446).buffer(5000) + >>> penn_station_buff = shapely.geometry.Point(986345.399, 211974.446).buffer(5000) >>> penn_lit = ibis.literal(penn_station_buff, "geometry") >>> t.filter(t.geom.within(penn_lit))["zone"] ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ diff --git a/third_party/bigframes_vendored/ibis/expr/types/json.py b/third_party/bigframes_vendored/ibis/expr/types/json.py index 388b4d8742..51d1642de0 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/json.py +++ b/third_party/bigframes_vendored/ibis/expr/types/json.py @@ -446,24 +446,6 
@@ def str(self) -> ir.StringValue: │ NULL │ └──────────────────────┘ - Note the difference between `.string` and `.cast("string")`. - - The latter preserves quotes for JSON string values and returns a valid - JSON string. - - >>> t.js.cast("string") - ┏━━━━━━━━━━━━━━━━━━┓ - ┃ Cast(js, string) ┃ - ┡━━━━━━━━━━━━━━━━━━┩ - │ string │ - ├──────────────────┤ - │ "a" │ - │ "b" │ - │ 1 │ - │ {} │ - │ [{"a": 1}] │ - └──────────────────┘ - Here's a more complex example with a table containing a JSON column with nested fields. diff --git a/third_party/bigframes_vendored/ibis/expr/types/relations.py b/third_party/bigframes_vendored/ibis/expr/types/relations.py index 919dec0669..d3d66b1512 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/relations.py +++ b/third_party/bigframes_vendored/ibis/expr/types/relations.py @@ -3798,7 +3798,7 @@ def pivot_longer( ... names_pattern=r"wk(.+)", ... names_transform=int, ... values_to="rank", - ... values_transform=_.cast("int"), + ... values_transform=_.cast(int), ... ).drop_null("rank") ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━┓ ┃ artist ┃ track ┃ date_entered ┃ week ┃ rank ┃ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e59232ee85..8f3e150606 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4433,7 +4433,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def minutes_to_hours(x: int) -> float: ... return x/60 @@ -4813,7 +4813,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] @@ -4828,7 +4828,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -4869,7 +4869,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 8dd43fd8da..4c9d1338f4 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1052,37 +1052,68 @@ def rolling( self, window, min_periods: int | None = None, + on: str | None = None, + closed: Literal["right", "left", "both", "neither"] = "right", ): """ Provide rolling window calculations. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0,1,2,3,4]) + >>> s.rolling(window=3).min() + 0 <NA> + 1 <NA> + 2 0 + 3 1 + 4 2 + dtype: Int64 + + >>> df = bpd.DataFrame({'A': [0,1,2,3], 'B': [0,2,4,6]}) + >>> df.rolling(window=2, on='A', closed='both').sum() + A B + 0 0 <NA> + 1 1 2 + 2 2 6 + 3 3 12 + + [4 rows x 2 columns] + Args: - window (int, timedelta, str, offset, or BaseIndexer subclass): + window (int, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, str): Size of the moving window. If an integer, the fixed number of observations used for each window. - If a timedelta, str, or offset, the time period of each window. Each - window will be a variable sized based on the observations included in - the time-period. This is only valid for datetime-like indexes. - To learn more about the offsets & frequency strings, please see `this link - <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. + If a string, the string representation of a timedelta. This string + must be parsable by pandas.Timedelta(). - If a BaseIndexer subclass, the window boundaries - based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, ``closed`` and - ``step`` will be passed to ``get_window_bounds``. + Otherwise, the time range for each window. min_periods (int, default None): Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. - For a window that is specified by an offset, ``min_periods`` will default to 1. - For a window that is specified by an integer, ``min_periods`` will default to the size of the window. + For a window that is not specified by an integer, ``min_periods`` will default + to 1. + + on (str, optional): + For a DataFrame, a column label on which to calculate the rolling window, + rather than the DataFrame’s index. + + closed (str, default 'right'): + If 'right', the first point in the window is excluded from calculations. + If 'left', the last point in the window is excluded from calculations. + If 'both', no points in the window are excluded from calculations. + If 'neither', the first and last points in the window are excluded from calculations. + Returns: bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. ``Rolling`` subclass if ``win_type`` is not passed. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 31a9aa6a93..4fb8498932 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1025,16 +1025,37 @@ def rolling(self, *args, **kwargs): dtype: Int64 Args: + window (int, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, str): + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a string, the string representation of a timedelta. This string + must be parsable by pandas.Timedelta(). + + Otherwise, the time range for each window. + min_periods (int, default None): Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. - For a window that is specified by an offset, - ``min_periods`` will default to 1. - For a window that is specified by an integer, ``min_periods`` will default to the size of the window. + For a window that is not specified by an integer, ``min_periods`` will default + to 1. 
+ + on (str, optional): + For a DataFrame, a column label on which to calculate the rolling window, + rather than the DataFrame’s index. + + closed (str, default 'right'): + If 'right', the first point in the window is excluded from calculations. + If 'left', the last point in the window is excluded from calculations. + If 'both', no points in the window are excluded from calculations. + If 'neither', the first and last points in the window are excluded from calculations. 
 + Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series: Return a new grouper with our rolling appended. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py index d911a303eb..fccaffdadf 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -31,8 +31,6 @@ def cut( age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins. - ``labels=False`` implies you just want the bins back. - **Examples:** >>> import bigframes.pandas as bpd @@ -55,7 +53,16 @@ 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} dtype: struct[pyarrow] - Cut with an integer (equal-width bins) and labels=False: + Cut with the same bins, but assign them specific labels: + + >>> bpd.cut(s, bins=3, labels=["bad", "medium", "good"]) + 0 bad + 1 bad + 2 medium + 3 good + dtype: string + + `labels=False` implies you want the bins back. >>> bpd.cut(s, bins=4, labels=False) 0 0 @@ -67,7 +74,6 @@ Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: >>> import pandas as pd - >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 <NA> @@ -107,7 +113,7 @@ dtype: struct[pyarrow] Args: - x (Series): + x (bigframes.pandas.Series): The input Series to be binned. Must be 1-dimensional. bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. @@ -127,10 +133,11 @@ ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels (default None): + labels (bool, Iterable, default None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. This affects the type of the output container. + bins. This affects the type of the output container. This argument is + ignored when `bins` is an IntervalIndex. If True, raises an error. Returns: bigframes.pandas.Series: diff --git a/third_party/bigframes_vendored/pandas/core/series.py index 913a2e7c3e..a2d0983652 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1854,7 +1854,7 @@ def apply( to potentially reuse a previously deployed `remote_function` from the same user defined function. - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def minutes_to_hours(x: int) -> float: ... return x/60 @@ -1883,6 +1883,7 @@ def apply( >>> @bpd.remote_function( ... reuse=False, ... packages=["cryptography"], + ... cloud_function_service_account="default" ... ) ... def get_hash(input: str) -> str: ... from cryptography.fernet import Fernet @@ -1900,7 +1901,7 @@ def apply( You could return an array output from the remote function. 
- >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -5069,7 +5070,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False) + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -5665,7 +5666,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function() + >>> @bpd.remote_function(cloud_function_service_account="default") ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py new file mode 100644 index 0000000000..fb29cc8984 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -0,0 +1,97 @@ +""" Matrix Factorization. +""" + +# Author: Alexandre Gramfort +# Olivier Grisel +# Mathieu Blondel +# Denis A. Engemann +# Michael Eickenberg +# Giorgio Patrini +# +# License: BSD 3 clause + +from abc import ABCMeta + +from bigframes_vendored.sklearn.base import BaseEstimator + +from bigframes import constants + + +class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): + """Matrix Factorization (MF). + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.decomposition import MatrixFactorization + >>> bpd.options.display.progress_bar = None + >>> X = bpd.DataFrame({ + ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], + ... "column": [0,1] * 7, + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 3], + ... }) + >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) + >>> W = model.fit(X) + + Args: + feedback_type ('explicit' | 'implicit'): + Specifies the feedback type for the model. The feedback type determines the algorithm that is used during training. + num_factors (int or auto, default auto): + Specifies the number of latent factors to use. + user_col (str): + The user column name. + item_col (str): + The item column name. + rating_col (str): + The rating column name. + l2_reg (float, default 1.0): + A floating point value for L2 regularization. The default value is 1.0. + """ + + def fit(self, X, y=None): + """Fit the model according to the given training data. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Training vector, + where `n_samples` is the number of samples and `n_features` is + the number of features. + + y (default None): + Ignored. + + Returns: + bigframes.ml.decomposition.MatrixFactorization: Fitted estimator. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def score(self, X=None, y=None): + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#matrix_factorization_models + for the outputs relevant to this model type. + + Args: + X (default None): + Ignored. + + y (default None): + Ignored. 
+
+        Returns:
+            bigframes.dataframe.DataFrame: DataFrame that represents model metrics.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def predict(self, X):
+        """Generate a predicted rating for every user-item row combination for a matrix factorization model.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                Series or a DataFrame to predict.
+
+        Returns:
+            bigframes.dataframe.DataFrame: Predicted DataFrame."""
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py
index 365aa12eb9..9d868f3343 100644
--- a/third_party/bigframes_vendored/tpch/queries/q11.py
+++ b/third_party/bigframes_vendored/tpch/queries/q11.py
@@ -43,4 +43,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
 
     result_df = result_df.sort_values(by="VALUE", ascending=False)
 
-    next(result_df.to_pandas_batches(max_results=1500))
+    result_df.to_pandas()
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py
index 356e73a71d..e3a1d84bfa 100644
--- a/third_party/bigframes_vendored/version.py
+++ b/third_party/bigframes_vendored/version.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.42.0"
+__version__ = "2.0.0"
 
 # {x-release-please-start-date}
-__release_date__ = "2025-03-27"
+__release_date__ = "2025-04-17"
 # {x-release-please-end}
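The rolling docstring earlier in this changeset documents the new `on` and `closed` parameters but ships no doctest. The following is a minimal sketch of how the two combine; the frame, column names, and the assumption of an open BigQuery session are illustrative, not part of the changeset.

import bigframes.pandas as bpd

df = bpd.DataFrame({
    "ts": [1, 2, 3, 4, 5],
    "value": [10.0, 20.0, 30.0, 40.0, 50.0],
})

# on="ts": the rolling window is calculated over the "ts" column
# rather than over the DataFrame's index.
# closed="left": the last point of each window is excluded from the
# calculation, per the docstring above.
rolled = df.rolling(window=3, on="ts", closed="left").sum()
print(rolled.to_pandas())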

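The notebook diffs that follow all apply the same BigFrames 2.0 `remote_function` migration. Here is a condensed before/after sketch using the `get_bucket` example from the getting-started notebook; the reading of `"default"` as opting into the default compute service account follows the breaking-change notes and is stated here as an assumption.

import bigframes.pandas as bpd

# BigFrames 1.x style: input/output types passed positionally.
#
#     @bpd.remote_function([float], str)
#     def get_bucket(num): ...

# BigFrames 2.0 style: types come from the annotations, decorator
# parameters are keyword-only, and the Cloud Functions service account
# must be stated explicitly ("default" keeps the old implicit behavior).
@bpd.remote_function(cloud_function_service_account="default")
def get_bucket(num: float) -> str:
    if not num:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"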
-    [... ~120 lines of truncated, cached `semantics.filter` demo output over Hacker News comments elided (commenter handles, comment snippets, timestamps; "[123 rows x 6 columns]") ...]
-    "source": [
-     "hacker_news.semantics.filter(\"{by} contains animal name\", model=gemini_model)"
-    ]
-    [... removed markdown cell reporting runtimes at the raised 500-requests-per-minute quota (https://cloud.google.com/vertex-ai/generative-ai/docs/quotas): ~6 minutes for 3,000 rows, ~26 minutes for 10,000 rows ...]
+    "The tutorial notebook for AI operators is located [here](https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/experimental/ai_operators.ipynb)."
] } ], @@ -3206,7 +53,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 88633f8635..788111cfe6 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -914,8 +914,8 @@ }, "outputs": [], "source": [ - "@bf.remote_function([str], str)\n", - "def extract_code(text: str):\n", + "@bf.remote_function(cloud_function_service_account=\"default\")\n", + "def extract_code(text: str) -> str:\n", " try:\n", " res = text[text.find('\\n')+1:text.find('```', 3)]\n", " res = res.replace(\"import pandas as pd\", \"import bigframes.pandas as bf\")\n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb index d458a0f53b..1a9b568897 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb @@ -369,7 +369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index c5deeef1c5..a8158bcb85 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -1485,8 +1485,8 @@ }, "outputs": [], "source": [ - "@bpd.remote_function([float], str)\n", - "def get_bucket(num):\n", + "@bpd.remote_function(cloud_function_service_account=\"default\")\n", + "def get_bucket(num: float) -> str:\n", " if not num: return \"NA\"\n", " boundary = 4000\n", " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index 1b138c6a66..066cd18136 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -1475,8 +1475,8 @@ } ], "source": [ - "@bpd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", - "def get_bucket(num):\n", + "@bpd.remote_function(bigquery_connection='bigframes-rf-conn', cloud_function_service_account=\"default\")\n", + "def get_bucket(num: float) -> str:\n", " if not num: return \"NA\"\n", " boundary = 4000\n", " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" diff --git a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb index 4bfdcc24aa..501bfc88d3 100644 --- a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb +++ b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb @@ -27,21 +27,25 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-10-01 22:44:50.650768+00:00 to avoid incompatibilies with previous reads of this table. 
To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n",
-     " exec(code_obj, self.user_global_ns, self.user_ns)\n"
-    ]
+    [... replaced by refreshed "Query job ... is DONE" display cells (0 Bytes, 28.9 kB, and 33.6 kB processed, each with an "Open Job" link) ...]
 [... re-executed preview of the penguins table: the previously sampled rows (Gentoo, Adelie, and Chinstrap penguins from Biscoe, Torgersen, and Dream, with culmen length/depth in mm, flipper length in mm, body mass in g, and sex) are replaced by a fresh ~25-row sample; the HTML table diff is truncated at this point in the source ...]
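To close, a hedged end-to-end sketch of the new `MatrixFactorization` estimator introduced in this changeset, following its docstring example. The ratings data is illustrative, and an active BigQuery session (plus enough data for BQML to train on) is assumed.

import bigframes.pandas as bpd
from bigframes.ml.decomposition import MatrixFactorization

ratings = bpd.DataFrame({
    "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6],        # user ids
    "column": [0, 1] * 7,                                      # item ids
    "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 3],   # observed ratings
})

model = MatrixFactorization(
    feedback_type="explicit",  # explicit ratings, vs. "implicit" feedback
    num_factors=6,
    user_col="row",
    item_col="column",
    rating_col="value",
    l2_reg=2.06,
)
model.fit(ratings)

# Evaluation metrics match BigQuery's ML.EVALUATE output for this model type.
metrics = model.score()

# One predicted rating per user-item row combination in the input.
predictions = model.predict(ratings)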