diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e336d3da3..54637f2f06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,19 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.29.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.28.0...v1.29.0) (2024-12-12) + + +### Features + +* Add Gemini 2.0 preview text model support ([#1209](https://github.com/googleapis/python-bigquery-dataframes/issues/1209)) ([1021d57](https://github.com/googleapis/python-bigquery-dataframes/commit/1021d5761a291f2327fc10216e938826e53dbcc4)) + + +### Documentation + +* Add Gemini 2.0 text gen sample notebook ([#1211](https://github.com/googleapis/python-bigquery-dataframes/issues/1211)) ([9596b66](https://github.com/googleapis/python-bigquery-dataframes/commit/9596b66a8a41f5e5db6fa5f87b01c5363ffa89c4)) +* Update bigframes.pandas.index docs return types ([#1191](https://github.com/googleapis/python-bigquery-dataframes/issues/1191)) ([c63e7da](https://github.com/googleapis/python-bigquery-dataframes/commit/c63e7dad6fe67f5769ddcdd1730666580a7e7a05)) + ## [1.28.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.27.0...v1.28.0) (2024-12-11) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3b1bf48558..ac79ec8625 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -446,7 +446,7 @@ def try_row_join( other_node, r_mapping = self.prepare_join_names(other) import bigframes.core.rewrite - result_node = bigframes.core.rewrite.try_join_as_projection( + result_node = bigframes.core.rewrite.try_row_join( self.node, other_node, conditions ) if result_node is None: diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 420348cca9..1dd780e070 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -83,10 +83,6 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: """Direct children of this node""" return tuple([]) - @property - def projection_base(self) -> BigFrameNode: - 
return self - @property @abc.abstractmethod def row_count(self) -> typing.Optional[int]: @@ -918,10 +914,6 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) - @property - def projection_base(self) -> BigFrameNode: - return self.child.projection_base - @property def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) @@ -1095,10 +1087,6 @@ def variables_introduced(self) -> int: def defines_namespace(self) -> bool: return True - @property - def projection_base(self) -> BigFrameNode: - return self.child.projection_base - @property def row_count(self) -> Optional[int]: return self.child.row_count @@ -1173,10 +1161,6 @@ def variables_introduced(self) -> int: def row_count(self) -> Optional[int]: return self.child.row_count - @property - def projection_base(self) -> BigFrameNode: - return self.child.projection_base - @property def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.assignments) @@ -1361,10 +1345,6 @@ def fields(self) -> Iterable[Field]: def variables_introduced(self) -> int: return 1 - @property - def projection_base(self) -> BigFrameNode: - return self.child.projection_base - @property def added_fields(self) -> Tuple[Field, ...]: return (self.added_field,) diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index f7ee3c87c2..854ef2b464 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -13,13 +13,13 @@ # limitations under the License. 
from bigframes.core.rewrite.identifiers import remap_variables -from bigframes.core.rewrite.implicit_align import try_join_as_projection +from bigframes.core.rewrite.implicit_align import try_row_join from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.slices import pullup_limit_from_slice, replace_slice_ops __all__ = [ "legacy_join_as_projection", - "try_join_as_projection", + "try_row_join", "replace_slice_ops", "pullup_limit_from_slice", "remap_variables", diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1d7fed09d2..41cc1ce82a 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -14,7 +14,7 @@ from __future__ import annotations import dataclasses -from typing import Optional, Tuple +from typing import Iterable, Optional, Tuple import bigframes.core.expression import bigframes.core.guid @@ -30,6 +30,11 @@ bigframes.core.nodes.WindowOpNode, bigframes.core.nodes.PromoteOffsetsNode, ) +# Combination of selects and additive nodes can be merged as an explicit keyless "row join" +ALIGNABLE_NODES = ( + *ADDITIVE_NODES, + bigframes.core.nodes.SelectionNode, +) @dataclasses.dataclass(frozen=True) @@ -70,49 +75,29 @@ def get_expression_spec( bigframes.core.nodes.PromoteOffsetsNode, ), ): - # we don't yet have a way of normalizing window ops into a ExpressionSpec, which only - # handles normalizing scalar expressions at the moment. - pass + if set(expression.column_references).isdisjoint( + field.id for field in curr_node.added_fields + ): + # we don't yet have a way of normalizing window ops into a ExpressionSpec, which only + # handles normalizing scalar expressions at the moment. 
+ pass + else: + return ExpressionSpec(expression, curr_node) else: return ExpressionSpec(expression, curr_node) curr_node = curr_node.child -def _linearize_trees( - base_tree: bigframes.core.nodes.BigFrameNode, - append_tree: bigframes.core.nodes.BigFrameNode, -) -> bigframes.core.nodes.BigFrameNode: - """Linearize two divergent tree who only diverge through different additive nodes.""" - assert append_tree.projection_base == base_tree.projection_base - # base case: append tree does not have any additive nodes to linearize - if append_tree == append_tree.projection_base: - return base_tree - else: - assert isinstance(append_tree, ADDITIVE_NODES) - return append_tree.replace_child(_linearize_trees(base_tree, append_tree.child)) - - -def combine_nodes( - l_node: bigframes.core.nodes.BigFrameNode, - r_node: bigframes.core.nodes.BigFrameNode, -) -> bigframes.core.nodes.BigFrameNode: - assert l_node.projection_base == r_node.projection_base - l_node, l_selection = pull_up_selection(l_node) - r_node, r_selection = pull_up_selection( - r_node, rename_vars=True - ) # Rename only right vars to avoid collisions with left vars - combined_selection = (*l_selection, *r_selection) - merged_node = _linearize_trees(l_node, r_node) - return bigframes.core.nodes.SelectionNode(merged_node, combined_selection) - - -def try_join_as_projection( +def try_row_join( l_node: bigframes.core.nodes.BigFrameNode, r_node: bigframes.core.nodes.BigFrameNode, join_keys: Tuple[Tuple[str, str], ...], ) -> Optional[bigframes.core.nodes.BigFrameNode]: """Joins the two nodes""" - if l_node.projection_base != r_node.projection_base: + divergent_node = first_shared_descendent( + l_node, r_node, descendable_types=ALIGNABLE_NODES + ) + if divergent_node is None: return None # check join keys are equivalent by normalizing the expressions as much as posisble # instead of just comparing ids @@ -124,11 +109,35 @@ def try_join_as_projection( r_node, right_id ): return None - return combine_nodes(l_node, r_node) 
+ + l_node, l_selection = pull_up_selection(l_node, stop=divergent_node) + r_node, r_selection = pull_up_selection( + r_node, stop=divergent_node, rename_vars=True + ) # Rename only right vars to avoid collisions with left vars + combined_selection = (*l_selection, *r_selection) + + def _linearize_trees( + base_tree: bigframes.core.nodes.BigFrameNode, + append_tree: bigframes.core.nodes.BigFrameNode, + ) -> bigframes.core.nodes.BigFrameNode: + """Linearize two divergent trees that only diverge through different additive nodes.""" + # base case: append tree does not have any divergent nodes to linearize + if append_tree == divergent_node: + return base_tree + else: + assert isinstance(append_tree, ADDITIVE_NODES) + return append_tree.replace_child( + _linearize_trees(base_tree, append_tree.child) + ) + + merged_node = _linearize_trees(l_node, r_node) + return bigframes.core.nodes.SelectionNode(merged_node, combined_selection) def pull_up_selection( - node: bigframes.core.nodes.BigFrameNode, rename_vars: bool = False + node: bigframes.core.nodes.BigFrameNode, + stop: bigframes.core.nodes.BigFrameNode, + rename_vars: bool = False, ) -> Tuple[ bigframes.core.nodes.BigFrameNode, Tuple[ @@ -147,14 +156,14 @@ def pull_up_selection( Returns: BigFrameNode, Selections """ - if node == node.projection_base: # base case + if node == stop: # base case return node, tuple( (bigframes.core.expression.DerefOp(field.id), field.id) for field in node.fields ) assert isinstance(node, (bigframes.core.nodes.SelectionNode, *ADDITIVE_NODES)) child_node, child_selections = pull_up_selection( - node.child, rename_vars=rename_vars + node.child, stop, rename_vars=rename_vars ) mapping = {out: ref.id for ref, out in child_selections} if isinstance(node, ADDITIVE_NODES): @@ -188,3 +197,30 @@ def pull_up_selection( ) return child_node, new_selection raise ValueError(f"Couldn't pull up select from node: {node}") + + +## Traversal helpers +def first_shared_descendent( + left: 
bigframes.core.nodes.BigFrameNode, + right: bigframes.core.nodes.BigFrameNode, + descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...], +) -> Optional[bigframes.core.nodes.BigFrameNode]: + l_path = tuple(descend(left, descendable_types)) + r_path = tuple(descend(right, descendable_types)) + if l_path[-1] != r_path[-1]: + return None + + for l_node, r_node in zip(l_path[-len(r_path) :], r_path[-len(l_path) :]): + if l_node == r_node: + return l_node + # should be impossible, as l_path[-1] == r_path[-1] + raise ValueError() + + +def descend( + root: bigframes.core.nodes.BigFrameNode, + descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...], +) -> Iterable[bigframes.core.nodes.BigFrameNode]: + yield root + if isinstance(root, descendable_types): + yield from descend(root.child, descendable_types) diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index 77ae9b3bb4..a671f34bd4 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -23,16 +23,17 @@ import bigframes.core.join_def as join_defs import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.rewrite.implicit_align import bigframes.operations as ops Selection = Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] 
-REWRITABLE_NODE_TYPES = ( - nodes.SelectionNode, - nodes.ProjectionNode, - nodes.FilterNode, - nodes.ReversedNode, - nodes.OrderByNode, +LEGACY_REWRITER_NODES = ( + bigframes.core.nodes.ProjectionNode, + bigframes.core.nodes.SelectionNode, + bigframes.core.nodes.ReversedNode, + bigframes.core.nodes.OrderByNode, + bigframes.core.nodes.FilterNode, ) @@ -51,9 +52,7 @@ def from_node_span( cls, node: nodes.BigFrameNode, target: nodes.BigFrameNode ) -> SquashedSelect: if node == target: - selection = tuple( - (scalar_exprs.DerefOp(id), id) for id in get_node_column_ids(node) - ) + selection = tuple((scalar_exprs.DerefOp(id), id) for id in node.ids) return cls(node, selection, None, ()) if isinstance(node, nodes.SelectionNode): @@ -357,27 +356,10 @@ def decompose_conjunction( return (expr,) -def get_node_column_ids(node: nodes.BigFrameNode) -> Tuple[ids.ColumnId, ...]: - return tuple(field.id for field in node.fields) - - def common_selection_root( l_tree: nodes.BigFrameNode, r_tree: nodes.BigFrameNode ) -> Optional[nodes.BigFrameNode]: """Find common subtree between join subtrees""" - l_node = l_tree - l_nodes: set[nodes.BigFrameNode] = set() - while isinstance(l_node, REWRITABLE_NODE_TYPES): - l_nodes.add(l_node) - l_node = l_node.child - l_nodes.add(l_node) - - r_node = r_tree - while isinstance(r_node, REWRITABLE_NODE_TYPES): - if r_node in l_nodes: - return r_node - r_node = r_node.child - - if r_node in l_nodes: - return r_node - return None + return bigframes.core.rewrite.implicit_align.first_shared_descendent( + l_tree, r_tree, descendable_types=LEGACY_REWRITER_NODES + ) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 427e99583d..dca7e555f6 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -63,6 +63,7 @@ _GEMINI_1P5_PRO_002_ENDPOINT = "gemini-1.5-pro-002" _GEMINI_1P5_FLASH_001_ENDPOINT = "gemini-1.5-flash-001" _GEMINI_1P5_FLASH_002_ENDPOINT = "gemini-1.5-flash-002" +_GEMINI_2_FLASH_EXP_ENDPOINT = "gemini-2.0-flash-exp" _GEMINI_ENDPOINTS = 
( _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, @@ -71,6 +72,12 @@ _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, + _GEMINI_2_FLASH_EXP_ENDPOINT, +) +_GEMINI_PREVIEW_ENDPOINTS = ( + _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, + _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT, + _GEMINI_2_FLASH_EXP_ENDPOINT, ) _CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet" @@ -757,10 +764,10 @@ class GeminiTextGenerator(base.BaseEstimator): Args: model_name (str, Default to "gemini-pro"): - The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001" and "gemini-1.5-flash-002". Default to "gemini-pro". + The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002" and "gemini-2.0-flash-exp". Default to "gemini-pro". .. note:: - "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" are subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. For more information, see the launch stage descriptions (https://cloud.google.com/products#product-launch-stages). 
@@ -786,11 +793,20 @@ def __init__( "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", ] = "gemini-pro", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, max_iterations: int = 300, ): + if model_name in _GEMINI_PREVIEW_ENDPOINTS: + warnings.warn( + f"""Model {model_name} is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages).""", + category=exceptions.PreviewWarning, + ) self.model_name = model_name self.session = session or bpd.get_global_session() self.max_iterations = max_iterations diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 6a14fb3451..1cf8dc8a53 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -67,6 +67,7 @@ llm._GEMINI_1P5_PRO_002_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_FLASH_002_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_2_FLASH_EXP_ENDPOINT: llm.GeminiTextGenerator, llm._CLAUDE_3_HAIKU_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator, diff --git a/bigframes/version.py b/bigframes/version.py index 19a26f60f3..36b3b75c36 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.28.0" +__version__ = "1.29.0" diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 6739c299d2..9dc79afa5d 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -1,5 +1,26 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -17,6 +38,8 @@ "\n", "The second section talks about applying semantic operators on real-world large datasets. The examples are designed to benchmark the performance of the operators, and to (maybe) spark some ideas for your next application scenarios.\n", "\n", + "You can open this notebook on Google Colab [here](https://colab.research.google.com/github/googleapis/python-bigquery-dataframes/blob/main/notebooks/experimental/semantic_operators.ipynb).\n", + "\n", "Without further ado, let's get started." 
] }, @@ -3893,7 +3916,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb new file mode 100644 index 0000000000..d458a0f53b --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb @@ -0,0 +1,377 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames Gemini 2.0 Text Generation Simple Example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: This feature is only available in bigframes >= 1.29.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "from bigframes.ml import llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Gemini 2.0 experimental Model with model_name as \"gemini-2.0-flash-exp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", + " Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", + " and might have limited support. For more information, see the launch stage descriptions\n", + " (https://cloud.google.com/products#product-launch-stages).\n", + " warnings.warn(\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/pandas/__init__.py:435: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return global_session.get_global_session()\n" + ] + }, + { + "data": { + "text/html": [ + "Query job f673a2ea-023e-4771-84a2-fb81f808fa1b is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model = llm.GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a simple DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2276ea5b-2e08-4ed6-af34-49a7d165d145 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt
0Tell me something about Gemini 2.0.
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " prompt\n", + "0 Tell me something about Gemini 2.0.\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"prompt\": [\"Tell me something about Gemini 2.0.\"]})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 9ba21e96-6023-491e-8e83-f2e6fa7df0e7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/__init__.py:109: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 933d45cc-4bc0-4bdf-b4b8-573da2d58be3 is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 3dda9bc6-84b1-4f4a-8891-85d25d8848ce is DONE. 4.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultml_generate_text_rai_resultml_generate_text_statusprompt
0Alright, let's talk about Gemini 2.0! It's a b...<NA>Tell me something about Gemini 2.0.
\n", + "

1 rows × 4 columns

\n", + "
[1 rows x 4 columns in total]" + ], + "text/plain": [ + " ml_generate_text_llm_result \\\n", + "0 Alright, let's talk about Gemini 2.0! It's a b... \n", + "\n", + " ml_generate_text_rai_result ml_generate_text_status \\\n", + "0 \n", + "\n", + " prompt \n", + "0 Tell me something about Gemini 2.0. \n", + "\n", + "[1 rows x 4 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = model.predict(df)\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Copy job 8e68af62-e7ab-475b-99c9-b79e8ba3c40b is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", + " Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", + " and might have limited support. For more information, see the launch stage descriptions\n", + " (https://cloud.google.com/products#product-launch-stages).\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job cae7f929-d8cb-4819-a644-ac832cdc0912 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "GeminiTextGenerator(connection_name='bigframes-dev.us.bigframes-rf-connection',\n", + " model_name='gemini-2.0-flash-exp',\n", + " session=)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.to_gbq(\"bigframes-dev.garrettwu.gemini_2_flash\", replace=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/noxfile.py b/noxfile.py index 341de704e5..cbc9d77558 100644 --- a/noxfile.py +++ b/noxfile.py @@ -749,6 +749,7 @@ def notebook(session: nox.Session): # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow + "notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb", # Gemini 2.0 backend isn't ready in prod yet. # TODO(b/366290533): to protect BQML quota "notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb", "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e1cbf02780..2234512a42 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -1358,4 +1358,4 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent # backend flakiness. # # Let's stop further clean up and leave it to later. 
- traceback.print_exception(exc) + traceback.print_exception(type(exc), exc, None) diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 37489d0e53..7602be2fca 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -165,25 +165,37 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu [ pytest.param(1, id="one", marks=pytest.mark.xfail(raises=ValueError)), pytest.param(2, id="two"), - pytest.param(4, id="four"), ], ) def test_cluster_by(session, text_embedding_generator, n_clusters): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( - ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), + ( + { + "Item": [ + "Orange", + "Cantaloupe", + "Watermelon", + "Chicken", + "Duck", + "Hen", + "Rooster", + ] + } + ), session=session, ) output_column = "cluster id" result = df.semantics.cluster_by( - "Product", + "Item", output_column, text_embedding_generator, n_clusters=n_clusters, ) assert output_column in result - assert len(result[output_column].unique()) == n_clusters + # In rare cases, it's possible to have fewer than K clusters due to randomness. 
+ assert len(result[output_column].unique()) <= n_clusters def test_cluster_by_invalid_column(session, text_embedding_generator): diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 4bc1bd63be..1690e8ab4c 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -267,6 +267,7 @@ def test_text_embedding_generator_multi_cols_predict_success( "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", ), ) def test_create_load_gemini_text_generator_model( @@ -297,6 +298,7 @@ def test_create_load_gemini_text_generator_model( "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", ), ) @pytest.mark.flaky(retries=2) @@ -322,6 +324,7 @@ def test_gemini_text_generator_predict_default_params_success( "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", ), ) @pytest.mark.flaky(retries=2) @@ -349,6 +352,7 @@ def test_gemini_text_generator_predict_with_params_success( "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", ), ) @pytest.mark.flaky(retries=2) @@ -469,3 +473,16 @@ def test_palm2_text_embedding_deprecated(): llm.PaLM2TextEmbeddingGenerator() except (Exception): pass + + +@pytest.mark.parametrize( + "model_name", + ( + "gemini-1.5-pro-preview-0514", + "gemini-1.5-flash-preview-0514", + "gemini-2.0-flash-exp", + ), +) +def test_gemini_preview_model_warnings(model_name): + with pytest.warns(exceptions.PreviewWarning): + llm.GeminiTextGenerator(model_name=model_name) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 87e6e5e965..c48c07424d 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -987,7 +987,7 @@ def drop(self, labels) -> Index: labels (array-like or scalar): 
Returns: - Index: Will be same type as self. + bigframes.pandas.Index: Will be same type as self. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 19a26f60f3..36b3b75c36 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.28.0" +__version__ = "1.29.0"