From 52b7786c3a28da6c29e3ddf12629802215194ad9 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 5 Aug 2024 16:21:51 -0700 Subject: [PATCH 01/15] fix: Fix caching from generating row numbers in partial ordering mode (#872) --- bigframes/series.py | 2 +- bigframes/session/__init__.py | 4 +++- tests/system/small/test_unordered.py | 15 ++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index d41553d0d7..069c469a85 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -641,7 +641,7 @@ def head(self, n: int = 5) -> Series: def tail(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series: """ Preview n arbitrary elements from the series without guarantees about row selection or ordering. diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index dc1da488a1..8ff5862bfc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1997,8 +1997,10 @@ def _cache_with_session_awareness(self, array_value: core.ArrayValue) -> None: ) if len(cluster_cols) > 0: self._cache_with_cluster_cols(core.ArrayValue(target), cluster_cols) - else: + elif self._strictly_ordered: self._cache_with_offsets(core.ArrayValue(target)) + else: + self._cache_with_cluster_cols(core.ArrayValue(target), []) def _simplify_with_caching(self, array_value: core.ArrayValue): """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 9f85ec99f9..5e124d73cd 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,7 +19,11 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_unordered_mode_sql_no_hash(unordered_session): @@ -51,6 +55,15 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_series_peek(unordered_session): + pd_series = pd.Series([1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) + bf_series = bpd.Series(pd_series, session=unordered_session) + pd_result = pd_series.groupby(pd_series % 4).sum() + bf_peek = bf_series.groupby(bf_series % 4).sum().peek(2) + + assert_series_equal(bf_peek, pd_result.reindex(bf_peek.index)) + + def test_unordered_mode_single_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) bf_df = bpd.DataFrame(pd_df, session=unordered_session) From 6e6f9df55d435afe0b3ade728ca06826e92a6ee6 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:22:53 -0700 Subject: [PATCH 02/15] docs: update streaming notebook (#887) --- notebooks/streaming/streaming_dataframe.ipynb | 203 +++++++++++------- 1 file changed, 127 insertions(+), 76 deletions(-) diff --git a/notebooks/streaming/streaming_dataframe.ipynb b/notebooks/streaming/streaming_dataframe.ipynb index d4cc255fa5..9b52c2d71e 100644 --- a/notebooks/streaming/streaming_dataframe.ipynb +++ b/notebooks/streaming/streaming_dataframe.ipynb @@ -17,10 +17,22 @@ "cell_type": "code", "execution_count": 1, 
"metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'1.13.0'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import bigframes\n", - "import bigframes.streaming as bst" + "# make sure bigframes version >= 1.12.0\n", + "bigframes.__version__" ] }, { @@ -29,10 +41,46 @@ "metadata": {}, "outputs": [], "source": [ - "bigframes.options._bigquery_options.project = \"bigframes-load-testing\"\n", + "import bigframes.pandas as bpd\n", + "import bigframes.streaming as bst\n", + "bigframes.options._bigquery_options.project = \"bigframes-load-testing\" # Change to your own project ID\n", "job_id_prefix = \"test_streaming_\"" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 65df3a2f-cda8-405d-8b38-20a755f9b9a0 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'birds.penguins_bigtable_streaming'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Copy a table from the public dataset for streaming jobs. Any changes to the table can be reflected in the streaming destination.\n", + "df = bpd.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", + "df.to_gbq(\"birds.penguins_bigtable_streaming\", if_exists=\"replace\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -43,13 +91,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/session/__init__.py:773: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n", "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", " warnings.warn(\n" ] @@ -61,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -82,7 +132,7 @@ { "data": { "text/html": [ - "Query job d57200dd-e6f1-42c7-876b-7f4a54994ae6 is DONE. 0 Bytes processed. Open Job" + "Query job dd20bd9d-4844-43e4-86ab-95759d7e673a is DONE. 2.7 kB processed. Open Job" ], "text/plain": [ "" @@ -104,7 +154,7 @@ { "data": { "text/html": [ - "Query job 1decce4a-eb32-49f4-8e47-7bda0220037a is DONE. 28.9 kB processed. Open Job" + "Query job 873e44ee-76e9-4254-83d3-04cf36fbd140 is DONE. 28.9 kB processed. 
Open Job" ], "text/plain": [ "" @@ -144,151 +194,151 @@ " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3875\n", + " 3875.0\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 2900\n", + " 2900.0\n", " \n", " \n", " 2\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3725\n", + " 3725.0\n", " \n", " \n", " 3\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 2975\n", + " 2975.0\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3050\n", + " 3050.0\n", " \n", " \n", " 5\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 2700\n", + " 2700.0\n", " \n", " \n", " 6\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3900\n", + " 3900.0\n", " \n", " \n", " 7\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3825\n", + " 3825.0\n", " \n", " \n", " 8\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3775\n", + " 3775.0\n", " \n", " \n", " 9\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3350\n", + " 3350.0\n", " \n", " \n", " 10\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3900\n", + " 3900.0\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3650\n", + " 3650.0\n", " \n", " \n", " 12\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3200\n", + " 3200.0\n", " \n", " \n", " 13\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3650\n", + " 3650.0\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3700\n", + " 3700.0\n", " \n", " \n", " 15\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3800\n", + " 3800.0\n", " \n", " \n", " 16\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3950\n", + " 3950.0\n", " \n", " \n", " 17\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3350\n", + " 3350.0\n", " \n", " \n", " 18\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3100\n", + " 3100.0\n", " \n", " \n", " 19\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3750\n", + " 3750.0\n", " \n", " \n", " 20\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3550\n", + " 3550.0\n", " \n", " \n", " 21\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3400\n", + " 3400.0\n", " \n", " \n", " 22\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3450\n", + " 3450.0\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3600\n", + " 3600.0\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3650\n", + " 3650.0\n", " \n", " \n", "\n", @@ -297,37 +347,37 @@ ], "text/plain": [ " species rowkey body_mass_g\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 2975\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 2700\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3900\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3775\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3350\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650\n", - " Adelie Penguin (Pygoscelis 
adeliae) Biscoe 3200\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3700\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3800\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3950\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3350\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3100\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3750\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3400\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 2975.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 2700.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3775.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3350.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3200.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3700.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3950.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3350.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3100.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3750.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3400.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", "...\n", "\n", "[165 rows x 3 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -351,22 +401,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:338: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:341: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", " warnings.warn(\n" ] } ], "source": [ - "job = sdf.to_bigtable(instance=\"streaming-testing-instance\",\n", - " table=\"garrettwu-no-col-family\",\n", - " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\",\n", + "job = sdf.to_bigtable(instance=\"streaming-testing-instance\", # Change to your own Bigtable instance name\n", + " table=\"garrettwu-no-col-family\", # Change to your own Bigtable table name\n", + " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\", # Change to your own service account\n", " app_profile=None,\n", " 
truncate=True,\n", " overwrite=True,\n", @@ -378,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -397,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -406,7 +456,7 @@ "True" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -419,13 +469,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### PubSub\n", - "Create Pubsub streaming job" + "### Pub/Sub\n", + "Create Pub/Sub streaming job" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -438,27 +488,28 @@ } ], "source": [ + "# Pub/Sub requires a single column\n", "sdf = sdf[[\"rowkey\"]]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:453: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:456: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", " warnings.warn(\n" ] } ], "source": [ "job = sdf.to_pubsub(\n", - " topic=\"penguins\",\n", - " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " topic=\"penguins\", # Change to your own Pub/Sub topic ID\n", + " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\", # Change to your own service account\n", " job_id=None,\n", " job_id_prefix=job_id_prefix,\n", " )" @@ -466,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -485,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -494,7 +545,7 @@ "True" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } From eb6080460344aff2fabb7864536ea4fe24c5fbef Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 6 Aug 2024 15:02:19 -0700 Subject: [PATCH 03/15] perf: Generate SQL with fewer CTEs (#877) --- bigframes/core/compile/compiled.py | 135 +++++++++++++++++------------ bigframes/core/compile/compiler.py | 1 - bigframes/core/window_spec.py | 13 ++- tests/unit/session/test_session.py | 3 +- 4 files changed, 93 insertions(+), 59 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 538789f9d7..cae527931c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -24,6 +24,7 @@ import ibis.backends.bigquery as ibis_bigquery import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.operations as ibis_ops import ibis.expr.types as ibis_types import pandas @@ -36,7 +37,6 @@ from bigframes.core.ordering import ( ascending_over, encode_order_string, - IntegerEncoding, join_orderings, OrderingExpression, RowOrdering, @@ -71,19 +71,16 @@ def __init__( # Allow creating a DataFrame directly from an Ibis table expression. # TODO(swast): Validate that each column references the same table (or # no table for literal values). 
- self._columns = tuple(columns) + self._columns = tuple( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) else column + for column in columns + ) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = { - ( - column.resolve(table) - # TODO(https://github.com/ibis-project/ibis/issues/7613): use - # public API to refer to Deferred type. - if isinstance(column, ibis.common.deferred.Deferred) - else column - ).get_name(): column - for column in self._columns - } + self._column_names = {column.get_name(): column for column in self._columns} @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -139,10 +136,6 @@ def projection( for expression, id in expression_id_pairs ] result = self._select(tuple(values)) # type: ignore - - # Need to reproject to convert ibis Scalar to ibis Column object - if any(exp_id[0].is_const for exp_id in expression_id_pairs): - result = result._reproject_to_table() return result @abc.abstractmethod @@ -300,8 +293,6 @@ def _to_ibis_expr( ArrayValue objects are sorted, so the following options are available to reflect this in the ibis expression. - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. * "string_encoded": An ordered string column is provided in output table. * "unordered": No ordering information will be provided in output. Only value columns are projected. @@ -355,6 +346,10 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> UnorderedIR: + if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -785,15 +780,33 @@ def promote_offsets(self, col_id: str) -> OrderedIR: """ # Special case: offsets already exist ordering = self._ordering + # Case 1, already have offsets, just create column from them + if ordering.is_sequential and (ordering.total_order_col is not None): + expr_builder = self.builder() + expr_builder.columns = [ + self._compile_expression( + ordering.total_order_col.scalar_expression + ).name(col_id), + *self.columns, + ] + return expr_builder.build() + # Cannot nest analytic expressions, so reproject to cte first if needed. 
+ # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) + # Seee: https://github.com/ibis-project/ibis/issues/9773 + can_directly_window = not any( + map(lambda x: is_literal(x) or is_window(x), self._ibis_order) + ) + if not can_directly_window: + return self._reproject_to_table().promote_offsets(col_id) - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + offsets = ibis.row_number().over(window) expr_builder = self.builder() expr_builder.columns = [ - self._compile_expression(ordering.total_order_col.scalar_expression).name( - col_id - ), *self.columns, + offsets.name(col_id), ] return expr_builder.build() @@ -806,7 +819,6 @@ def project_window_op( output_name=None, *, never_skip_nulls=False, - skip_reproject_unsafe: bool = False, ) -> OrderedIR: """ Creates a new expression based on this expression with unary operation applied to one column. @@ -815,8 +827,25 @@ def project_window_op( window_spec: a specification of the window over which to apply the operator output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ + # Cannot nest analytic expressions, so reproject to cte first if needed. + # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) + # See: https://github.com/ibis-project/ibis/issues/9773 + used_exprs = map( + self._get_any_column, [column_name, *window_spec.all_referenced_columns] + ) + can_directly_window = not any( + map(lambda x: is_literal(x) or is_window(x), used_exprs) + ) + if not can_directly_window: + return self._reproject_to_table().project_window_op( + column_name, + op, + window_spec, + output_name, + never_skip_nulls=never_skip_nulls, + ) + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) window = self._ibis_window_from_spec( window_spec, require_total_order=op.uses_total_row_ordering @@ -861,8 +890,7 @@ def project_window_op( window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. - return result._reproject_to_table() if not skip_reproject_unsafe else result + return result def _reproject_to_table(self) -> OrderedIR: table = self._to_ibis_expr( @@ -944,7 +972,7 @@ def _to_ibis_expr( expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + ordering_mode: Literal["string_encoded", "unordered"], order_col_name: Optional[str] = ORDER_ID_COLUMN, ): """ @@ -953,8 +981,7 @@ def _to_ibis_expr( ArrayValue objects are sorted, so the following options are available to reflect this in the ibis expression. - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. 
+ * "string_encoded": An ordered string column is provided in output table. * "unordered": No ordering information will be provided in output. Only value columns are projected. @@ -981,10 +1008,9 @@ def _to_ibis_expr( """ assert ordering_mode in ( "string_encoded", - "offset_col", "unordered", ) - if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): + if expose_hidden_cols and ordering_mode in ("ordered_col"): raise ValueError( f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" ) @@ -1034,6 +1060,10 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> OrderedIR: + if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -1174,27 +1204,6 @@ def _bake_ordering(self) -> OrderedIR: predicates=self._predicates, ) - def _project_offsets(self) -> OrderedIR: - """Create a new expression that contains offsets. Should only be executed when - offsets are needed for an operations. Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = TotalOrdering( - ordering_value_columns=tuple([ascending_over(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return OrderedIR( - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - def _create_order_columns( self, ordering_mode: str, @@ -1202,9 +1211,7 @@ def _create_order_columns( expose_hidden_cols: bool, ) -> typing.Sequence[ibis_types.Value]: # Generate offsets if current ordering id semantics are not sufficiently strict - if ordering_mode == "offset_col": - return (self._create_offset_column().name(order_col_name),) - elif ordering_mode == "string_encoded": + if ordering_mode == "string_encoded": return (self._create_string_ordering_column().name(order_col_name),) elif expose_hidden_cols: return self._hidden_ordering_columns @@ -1328,6 +1335,22 @@ def build(self) -> OrderedIR: ) +def is_literal(column: ibis_types.Value) -> bool: + # Unfortunately, Literals in ibis are not "Columns"s and therefore can't be aggregated. 
+ return not isinstance(column, ibis_types.Column) + + +def is_window(column: ibis_types.Value) -> bool: + matches = ( + (column) + .op() + .find_topmost( + lambda x: isinstance(x, (ibis_ops.WindowFunction, ibis_ops.Relation)) + ) + ) + return any(isinstance(op, ibis_ops.WindowFunction) for op in matches) + + def _reduce_predicate_list( predicate_list: typing.Collection[ibis_types.BooleanValue], ) -> ibis_types.BooleanValue: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index c7f8c5ab59..8fb1f7ab3a 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -304,7 +304,6 @@ def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): node.window_spec, node.output_name, never_skip_nulls=node.never_skip_nulls, - skip_reproject_unsafe=node.skip_reproject_unsafe, ) return result if ordered else result.to_unordered() diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index 57c57b451a..f011e2848d 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -14,7 +14,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Tuple, Union +import itertools +from typing import Optional, Set, Tuple, Union import bigframes.core.ordering as orderings @@ -162,3 +163,13 @@ def row_bounded(self): to calculate deterministically. """ return isinstance(self.bounds, RowsWindowBounds) + + @property + def all_referenced_columns(self) -> Set[str]: + """ + Return list of all variables reference ind the window. + """ + ordering_vars = itertools.chain.from_iterable( + item.scalar_expression.unbound_variables for item in self.ordering + ) + return set(itertools.chain(self.grouping_keys, ordering_vars)) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 31029abd67..2f7eaa567a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -246,7 +246,8 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, ) - # We expect a window operation because we specificaly requested a sequential index. + # We expect a window operation because we specificaly requested a sequential index and named it. 
+ df.index.name = "named_index" generated_sql = df.sql.casefold() assert "OVER".casefold() in generated_sql assert "ROW_NUMBER()".casefold() in generated_sql From 171da6cb33165b49d46ea6528038342abd89e9fa Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:35:05 -0700 Subject: [PATCH 04/15] docs: add streaming html docs (#884) * docs: add streaming html docs * add beta icon * add beta icon in toc.yml --- bigframes/session/__init__.py | 4 +++- bigframes/streaming/dataframe.py | 11 ++++++++++- docs/reference/bigframes.streaming/dataframe.rst | 6 ++++++ docs/reference/bigframes.streaming/index.rst | 13 +++++++++++++ docs/reference/index.rst | 1 + docs/templates/toc.yml | 7 +++++++ 6 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 docs/reference/bigframes.streaming/dataframe.rst create mode 100644 docs/reference/bigframes.streaming/index.rst diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8ff5862bfc..2da788292b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -760,7 +760,9 @@ def read_gbq_table_streaming( ) -> streaming_dataframe.StreamingDataFrame: """Turn a BigQuery table into a StreamingDataFrame. - Note: The bigframes.streaming module is a preview feature, and subject to change. + .. note:: + + The bigframes.streaming module is a preview feature, and subject to change. **Examples:** diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 64a4898c57..b83ae5d822 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -191,7 +191,16 @@ def to_pubsub( @log_adapter.class_logger class StreamingDataFrame(StreamingBase): - __doc__ = _curate_df_doc(dataframe.DataFrame.__doc__) + __doc__ = ( + _curate_df_doc(dataframe.DataFrame.__doc__) + + """ + .. note:: + + The bigframes.streaming module is a preview feature, and subject to change. + + Currently only supports basic projection, filtering and preview operations. + """ + ) # Private constructor _create_key = object() diff --git a/docs/reference/bigframes.streaming/dataframe.rst b/docs/reference/bigframes.streaming/dataframe.rst new file mode 100644 index 0000000000..79ec64961c --- /dev/null +++ b/docs/reference/bigframes.streaming/dataframe.rst @@ -0,0 +1,6 @@ +bigframes.streaming.dataframe +============================= + +.. autoclass:: bigframes.streaming.dataframe.StreamingDataFrame + :members: + :inherited-members: diff --git a/docs/reference/bigframes.streaming/index.rst b/docs/reference/bigframes.streaming/index.rst new file mode 100644 index 0000000000..20a22072e5 --- /dev/null +++ b/docs/reference/bigframes.streaming/index.rst @@ -0,0 +1,13 @@ + +============================ +BigQuery DataFrame Streaming +============================ + +.. automodule:: bigframes.streaming + :members: + :undoc-members: + +.. toctree:: + :maxdepth: 2 + + dataframe diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 387e9b5ced..eb5a774b29 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -11,3 +11,4 @@ packages. 
bigframes.pandas/index bigframes.ml/index bigframes.bigquery/index + bigframes.streaming/index diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 95bded9a60..736ffba286 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -203,4 +203,11 @@ - name: BigQuery built-in functions uid: bigframes.bigquery name: bigframes.bigquery + - items: + - name: Overview + uid: bigframes.streaming + - name: StreamingDataFrame + uid: bigframes.streaming.dataframe.StreamingDataFrame + name: bigframes.streaming + status: beta name: BigQuery DataFrames From 8c352ce6991b3635222a285d1eee9a56cd57d0c6 Mon Sep 17 00:00:00 2001 From: mattyopl <90574735+mattyopl@users.noreply.github.com> Date: Tue, 6 Aug 2024 20:31:21 -0400 Subject: [PATCH 05/15] chore: clean up OWNERS (#886) - remove inactive users - add myself Co-authored-by: Matthew Laurence Chen --- OWNERS | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/OWNERS b/OWNERS index f86ad551ef..562ee0f19b 100644 --- a/OWNERS +++ b/OWNERS @@ -1,12 +1,8 @@ -ashleyxu@google.com -bmil@google.com chelsealin@google.com garrettwu@google.com -henryjsolberg@google.com -hormati@google.com huanc@google.com jiaxun@google.com -kemppeterson@google.com +mlaurencechen@google.com shobs@google.com swast@google.com -tbergeron@google.com +tbergeron@google.com \ No newline at end of file From 3dbf84bd1531c1f8d41ba57c2c38b3ba6abfb812 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:03:50 -0700 Subject: [PATCH 06/15] feat: bigframes.bigquery.json_extract (#868) * feat: bigframes.bigquery.json_extract * fixing tests --- bigframes/bigquery/__init__.py | 35 ++++++++++++++++++++ bigframes/core/compile/scalar_op_compiler.py | 12 +++++++ bigframes/operations/__init__.py | 16 +++++++++ tests/system/small/bigquery/test_json.py | 27 +++++++++++++++ 4 files changed, 90 insertions(+) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index ec26d14f33..7c409839b1 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -208,6 +208,41 @@ def json_set( return series +def json_extract( + series: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` + value. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_extract(s, json_path="$.class") + 0 "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}" + dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
+ """ + return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + + +# Search functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions + + def vector_search( base_table: str, column_to_search: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 67d0dac436..32749b32a6 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -922,6 +922,11 @@ def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): ).to_expr() +@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) +def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): + return json_extract(json_obj=x, json_path=op.json_path) + + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" @@ -1549,6 +1554,13 @@ def json_set( """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" +@ibis.udf.scalar.builtin(name="json_extract") +def json_extract( + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str +) -> ibis_dtypes.JSON: + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + + @ibis.udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 523882c14e..4d4e40643d 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,6 +602,22 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +## JSON Ops +@dataclasses.dataclass(frozen=True) +class JSONExtract(UnaryOp): + name: typing.ClassVar[str] = "json_extract" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be an valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return input_type + + # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 9e0c06e0bd..059b8eea87 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -110,3 +110,30 @@ def test_json_set_w_invalid_value_type(): def test_json_set_w_invalid_series_type(): with pytest.raises(TypeError): bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) + + +def test_json_extract_from_json(): + s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) + actual = bbq.json_extract(s, "$.a.b") + # After the introduction of the JSON type, the output should be a JSON-formatted series. 
+ expected = _get_series_from_json(["[1,2]", None, "0"]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_extract_from_string(): + s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) + actual = bbq.json_extract(s, "$.a.b") + expected = _get_series_from_json(["[1,2]", None, "0"]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + check_names=False, + ) + + +def test_json_extract_w_invalid_series_type(): + with pytest.raises(TypeError): + bbq.json_extract(bpd.Series([1, 2]), "$.a") From 0603e62fcdf513ca4207909af4dbd8d036af0b0c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:38:58 -0700 Subject: [PATCH 07/15] chore: fix docs build on owlbot.py (#888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: fix docs build on owlbot.py * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .kokoro/publish-docs.sh | 3 +++ owlbot.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh index 233205d580..297b14ac90 100755 --- a/.kokoro/publish-docs.sh +++ b/.kokoro/publish-docs.sh @@ -58,5 +58,8 @@ python3.10 -m docuploader create-metadata \ cat docs.metadata +# Replace toc.yml template file +mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml + # upload docs python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}" diff --git a/owlbot.py b/owlbot.py index ddc578c3a2..f9d9410d6d 100644 --- a/owlbot.py +++ b/owlbot.py @@ -112,14 +112,14 @@ re.escape("# upload docs") + "\n" + re.escape( - 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' + 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' ) ), ( "# Replace toc.yml template file\n" + "mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml\n\n" + "# upload docs\n" - + 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' + + 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' ), ) From 3eb6a17a5823faf5ecba92cb9a554df74477871d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 8 Aug 2024 12:03:30 -0700 Subject: [PATCH 08/15] docs: fix the `DisplayOptions` doc rendering (#893) --- third_party/bigframes_vendored/pandas/core/config_init.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 84ab90a322..4bca3f3c75 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -18,6 +18,7 @@ **Examples:** Define Repr mode to "deferred" will prevent job execution in repr. + >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") @@ -26,25 +27,32 @@ Computation deferred. 
Computation will process 28.9 kB Users can also get a dry run of the job by accessing the query_job property before they've run the job. This will return a dry run instance of the job they can inspect. + >>> df.query_job.total_bytes_processed 28947 User can execute the job by calling .to_pandas() + >>> # df.to_pandas() Reset repr_mode option + >>> bpd.options.display.repr_mode = "head" Can also set the progress_bar option to see the progress bar in terminal, + >>> bpd.options.display.progress_bar = "terminal" notebook, + >>> bpd.options.display.progress_bar = "notebook" or just remove it. + >>> bpd.options.display.progress_bar = None Setting to default value "auto" will detect and show progress bar automatically. + >>> bpd.options.display.progress_bar = "auto" Attributes: From 0c011a8212a8c0824a12c560b6d63048362275e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 8 Aug 2024 14:28:26 -0500 Subject: [PATCH 09/15] test: enable tests on Windows (#857) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: enable tests on Windows * use windows subdirectory for scripts * maybe fix for windows * fix working directory * use nox so test dependencies are installed * add 3.10 to system tests * disable system tests * add more goto errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .kokoro/continuous/windows.cfg | 3 +++ .kokoro/presubmit/windows.cfg | 3 +++ noxfile.py | 6 +++-- scripts/windows/build.bat | 38 ++++++++++++++++++++++++++++++++ scripts/windows/test.bat | 40 ++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 .kokoro/continuous/windows.cfg create mode 100644 .kokoro/presubmit/windows.cfg create mode 100644 scripts/windows/build.bat create mode 100644 scripts/windows/test.bat diff --git a/.kokoro/continuous/windows.cfg b/.kokoro/continuous/windows.cfg new file mode 100644 index 0000000000..806986138d --- /dev/null +++ b/.kokoro/continuous/windows.cfg @@ -0,0 +1,3 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +build_file: "python-bigquery-dataframes/scripts/windows/build.bat" diff --git a/.kokoro/presubmit/windows.cfg b/.kokoro/presubmit/windows.cfg new file mode 100644 index 0000000000..806986138d --- /dev/null +++ b/.kokoro/presubmit/windows.cfg @@ -0,0 +1,3 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +build_file: "python-bigquery-dataframes/scripts/windows/build.bat" diff --git a/noxfile.py b/noxfile.py index d69c16e69c..c464b47270 100644 --- a/noxfile.py +++ b/noxfile.py @@ -62,7 +62,8 @@ UNIT_TEST_EXTRAS: List[str] = [] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.12"] +# 3.10 is needed for Windows tests. +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", @@ -100,7 +101,8 @@ "docfx", "unit", "unit_noextras", - "system", + "system-3.9", + "system-3.12", "cover", ] diff --git a/scripts/windows/build.bat b/scripts/windows/build.bat new file mode 100644 index 0000000000..d599702c98 --- /dev/null +++ b/scripts/windows/build.bat @@ -0,0 +1,38 @@ +@rem Copyright 2024 Google LLC +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. + +:; Change directory to repo root. +SET script_dir="%~dp0" +cd "%~dp0"\..\.. + +echo "Listing available Python versions' +py -0 || goto :error + +py -3.10 -m pip install --upgrade pip || goto :error +py -3.10 -m pip install --upgrade pip setuptools wheel || goto :error + +echo "Building Wheel" +py -3.10 -m pip wheel . --wheel-dir wheels || goto :error/ + +echo "Built wheel, now running tests." +call "%script_dir%"/test.bat 3.10 || goto :error + +echo "Windows build has completed successfully" + +:; https://stackoverflow.com/a/46813196/101923 +:; exit 0 +exit /b 0 + +:error +exit /b %errorlevel% diff --git a/scripts/windows/test.bat b/scripts/windows/test.bat new file mode 100644 index 0000000000..bcd605bd12 --- /dev/null +++ b/scripts/windows/test.bat @@ -0,0 +1,40 @@ +@rem Copyright 2024 Google LLC +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. + +@rem This test file runs for one Python version at a time, and is intended to +@rem be called from within the build loop. + +:; Change directory to repo root. +SET script_dir="%~dp0" +cd "%~dp0"\..\.. 
+ +set PYTHON_VERSION=%1 +if "%PYTHON_VERSION%"=="" ( + echo "Python version was not provided, using Python 3.10" + set PYTHON_VERSION=3.10 +) + +py -%PYTHON_VERSION%-64 -m pip install nox || goto :error + +py -%PYTHON_VERSION%-64 -m nox -s unit-"%PYTHON_VERSION%" || goto :error + +:; TODO(b/358148440): enable system tests on windows +:; py -%PYTHON_VERSION%-64 -m nox -s system-"%PYTHON_VERSION%" || goto :error + +:; https://stackoverflow.com/a/46813196/101923 +:; exit 0 +exit /b 0 + +:error +exit /b %errorlevel% From 7117e33f2b6bc89ae8ce9b168d98bbcb21c08e52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 8 Aug 2024 16:31:23 -0500 Subject: [PATCH 10/15] chore: require Windows unit tests and re-enable owlbot check (#895) --- .github/sync-repo-settings.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index e098468da6..c2f3673fcc 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -7,8 +7,7 @@ branchProtectionRules: requiresCodeOwnerReviews: true requiresStrictStatusChecks: false requiredStatusCheckContexts: -# TODO(b/347075426): Restore owlbot as required check -# - 'OwlBot Post Processor' + - 'OwlBot Post Processor' - 'conventionalcommits.org' - 'cla/google' - 'docs' @@ -19,6 +18,7 @@ branchProtectionRules: - 'unit (3.12)' - 'cover' - 'Kokoro presubmit' + - 'Kokoro windows' permissionRules: - team: actools-python permission: admin From e0b11bc8c038db7b950b1653ed4cd44a6246c713 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 9 Aug 2024 16:00:36 -0700 Subject: [PATCH 11/15] perf: Speed up compilation by reducing redundant type normalization (#896) --- bigframes/bigquery/__init__.py | 2 +- bigframes/core/compile/compiled.py | 23 +++-------------------- bigframes/core/compile/ibis_types.py | 1 + tests/system/small/bigquery/test_json.py | 20 +++++++++++--------- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 7c409839b1..fb9503dc72 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -224,7 +224,7 @@ def json_extract( >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") - 0 "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}" + 0 {"students":[{"id":5},{"id":12}]} dtype: string Args: diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cae527931c..5492502f21 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -152,12 +152,7 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast( - ibis_types.Value, - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - self._column_names[key] - ), - ) + return typing.cast(ibis_types.Value, self._column_names[key]) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -327,12 +322,7 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = self._table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) - for column in columns - ) + table = self._table.select(columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) @@ -1039,14 +1029,7 @@ def _to_ibis_expr( # Make sure we don't have any unbound (deferred) columns. table = self._table.select(columns) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - table[column] - ) - for column in table.columns - ) + table = table.select(table[column] for column in table.columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f3221f605f..0b3038c9c7 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -208,6 +208,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: name = value.get_name() if ibis_type.is_json(): value = vendored_ibis_ops.ToJsonString(value).to_expr() + value = value.case().when("null", ibis.null()).else_(value).end() return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 059b8eea87..18ccadd9f5 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -23,11 +23,13 @@ def _get_series_from_json(json_data): + # Note: converts None to sql "null" and not to json none. + values = [ + f"JSON '{json.dumps(data)}'" if data is not None else "NULL" + for data in json_data + ] sql = " UNION ALL ".join( - [ - f"SELECT {id} AS id, JSON '{json.dumps(data)}' AS data" - for id, data in enumerate(json_data) - ] + [f"SELECT {id} AS id, {value} AS data" for id, value in enumerate(values)] ) df = bpd.read_gbq(sql).set_index("id").sort_index() return df["data"] @@ -114,19 +116,19 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) - actual = bbq.json_extract(s, "$.a.b") + actual = bbq.json_extract(s, "$.a.b").to_pandas() # After the introduction of the JSON type, the output should be a JSON-formatted series. 
- expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual, + expected, ) def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), From 991bb0a25e1e424de38abd065f9d79ab20c24ed2 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 12 Aug 2024 14:32:53 -0700 Subject: [PATCH 12/15] refactor: reorganize `remote_function` code for readability (#885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: reorganize `remote_function` code for readability * refactor out RemoteFunctionSession * rename newly introduced modules to make them private symbolically --------- Co-authored-by: Tim Sweña (Swast) --- .../functions/_remote_function_client.py | 476 +++++++ .../functions/_remote_function_session.py | 546 ++++++++ bigframes/functions/_utils.py | 214 +++ bigframes/functions/remote_function.py | 1174 +---------------- bigframes/pandas/__init__.py | 4 +- bigframes/session/__init__.py | 3 +- tests/system/large/test_remote_function.py | 10 +- tests/system/small/test_remote_function.py | 7 +- tests/system/utils.py | 4 +- 9 files changed, 1261 insertions(+), 1177 deletions(-) create mode 100644 bigframes/functions/_remote_function_client.py create mode 100644 bigframes/functions/_remote_function_session.py create mode 100644 bigframes/functions/_utils.py diff --git a/bigframes/functions/_remote_function_client.py b/bigframes/functions/_remote_function_client.py new file mode 100644 index 0000000000..6ef482ecda --- /dev/null +++ b/bigframes/functions/_remote_function_client.py @@ -0,0 +1,476 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import inspect +import logging +import os +import random +import shutil +import string +import sys +import tempfile +from typing import cast, Tuple, TYPE_CHECKING + +import requests + +from bigframes import constants +import bigframes.functions.remote_function_template + +if TYPE_CHECKING: + from bigframes.session import Session + +import google.api_core.exceptions +import google.api_core.retry +from google.cloud import bigquery, functions_v2 + +from . 
import _utils + +logger = logging.getLogger(__name__) + + +class RemoteFunctionClient: + # Wait time (in seconds) for an IAM binding to take effect after creation + _iam_wait_seconds = 120 + + def __init__( + self, + gcp_project_id, + cloud_function_region, + cloud_functions_client, + bq_location, + bq_dataset, + bq_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + *, + session: Session, + ): + self._gcp_project_id = gcp_project_id + self._cloud_function_region = cloud_function_region + self._cloud_functions_client = cloud_functions_client + self._bq_location = bq_location + self._bq_dataset = bq_dataset + self._bq_client = bq_client + self._bq_connection_id = bq_connection_id + self._bq_connection_manager = bq_connection_manager + self._cloud_function_service_account = cloud_function_service_account + self._cloud_function_kms_key_name = cloud_function_kms_key_name + self._cloud_function_docker_repository = cloud_function_docker_repository + self._session = session + + def create_bq_remote_function( + self, + input_args, + input_types, + output_type, + endpoint, + bq_function_name, + max_batching_rows, + ): + """Create a BigQuery remote function given the artifacts of a user defined + function and the http endpoint of a corresponding cloud function.""" + if self._bq_connection_manager: + self._bq_connection_manager.create_bq_connection( + self._gcp_project_id, + self._bq_location, + self._bq_connection_id, + "run.invoker", + ) + + # Create BQ function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 + bq_function_args = [] + bq_function_return_type = output_type + + # We are expecting the input type annotations to be 1:1 with the input args + for name, type_ in zip(input_args, input_types): + bq_function_args.append(f"{name} {type_}") + + remote_function_options = { + "endpoint": endpoint, + "max_batching_rows": max_batching_rows, + } + + remote_function_options_str = ", ".join( + [ + f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" + for key, val in remote_function_options.items() + if val is not None + ] + ) + + create_function_ddl = f""" + CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) + RETURNS {bq_function_return_type} + REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` + OPTIONS ({remote_function_options_str})""" + + logger.info(f"Creating BQ remote function: {create_function_ddl}") + + # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and + # create it + dataset = bigquery.Dataset( + bigquery.DatasetReference.from_string( + self._bq_dataset, default_project=self._gcp_project_id + ) + ) + dataset.location = self._bq_location + try: + # This check does not require bigquery.datasets.create IAM + # permission. So, if the data set already exists, then user can work + # without having that permission. + self._bq_client.get_dataset(dataset) + except google.api_core.exceptions.NotFound: + # This requires bigquery.datasets.create IAM permission + self._bq_client.create_dataset(dataset, exists_ok=True) + + # TODO(swast): plumb through the original, user-facing api_name. 
+ _, query_job = self._session._start_query(create_function_ddl) + logger.info(f"Created remote function {query_job.ddl_target_routine}") + + def get_cloud_function_fully_qualified_parent(self): + "Get the fully qualilfied parent for a cloud function." + return self._cloud_functions_client.common_location_path( + self._gcp_project_id, self._cloud_function_region + ) + + def get_cloud_function_fully_qualified_name(self, name): + "Get the fully qualilfied name for a cloud function." + return self._cloud_functions_client.function_path( + self._gcp_project_id, self._cloud_function_region, name + ) + + def get_remote_function_fully_qualilfied_name(self, name): + "Get the fully qualilfied name for a BQ remote function." + return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" + + def get_cloud_function_endpoint(self, name): + """Get the http endpoint of a cloud function if it exists.""" + fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) + try: + response = self._cloud_functions_client.get_function( + name=fully_qualified_name + ) + return response.service_config.uri + except google.api_core.exceptions.NotFound: + pass + return None + + def generate_cloud_function_code( + self, + def_, + directory, + *, + input_types: Tuple[str], + output_type: str, + package_requirements=None, + is_row_processor=False, + ): + """Generate the cloud function code for a given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ + + # requirements.txt + if package_requirements: + requirements_txt = os.path.join(directory, "requirements.txt") + with open(requirements_txt, "w") as f: + f.write("\n".join(package_requirements)) + + # main.py + entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( + def_, + directory, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, + ) + return entry_point + + def create_cloud_function( + self, + def_, + cf_name, + *, + input_types: Tuple[str], + output_type: str, + package_requirements=None, + timeout_seconds=600, + max_instance_count=None, + is_row_processor=False, + vpc_connector=None, + memory_mib=1024, + ): + """Create a cloud function from the given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ + + # Build and deploy folder structure containing cloud function + with tempfile.TemporaryDirectory() as directory: + entry_point = self.generate_cloud_function_code( + def_, + directory, + package_requirements=package_requirements, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, + ) + archive_path = shutil.make_archive(directory, "zip", directory) + + # We are creating cloud function source code from the currently running + # python version. Use the same version to deploy. This is necessary + # because cloudpickle serialization done in one python version and + # deserialization done in another python version doesn't work. + # TODO(shobs): Figure out how to achieve version compatibility, specially + # when pickle (internally used by cloudpickle) guarantees that: + # https://docs.python.org/3/library/pickle.html#:~:text=The%20pickle%20serialization%20format%20is,unique%20breaking%20change%20language%20boundary. 
+ python_version = "python{}{}".format( + sys.version_info.major, sys.version_info.minor + ) + + # Determine an upload URL for user code + upload_url_request = functions_v2.GenerateUploadUrlRequest( + kms_key_name=self._cloud_function_kms_key_name + ) + upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() + upload_url_response = self._cloud_functions_client.generate_upload_url( + request=upload_url_request + ) + + # Upload the code to GCS + with open(archive_path, "rb") as f: + response = requests.put( + upload_url_response.upload_url, + data=f, + headers={"content-type": "application/zip"}, + ) + if response.status_code != 200: + raise RuntimeError( + "Failed to upload user code. code={}, reason={}, text={}".format( + response.status_code, response.reason, response.text + ) + ) + + # Deploy Cloud Function + create_function_request = functions_v2.CreateFunctionRequest() + create_function_request.parent = ( + self.get_cloud_function_fully_qualified_parent() + ) + create_function_request.function_id = cf_name + function = functions_v2.Function() + function.name = self.get_cloud_function_fully_qualified_name(cf_name) + function.build_config = functions_v2.BuildConfig() + function.build_config.runtime = python_version + function.build_config.entry_point = entry_point + function.build_config.source = functions_v2.Source() + function.build_config.source.storage_source = functions_v2.StorageSource() + function.build_config.source.storage_source.bucket = ( + upload_url_response.storage_source.bucket + ) + function.build_config.source.storage_source.object_ = ( + upload_url_response.storage_source.object_ + ) + function.build_config.docker_repository = ( + self._cloud_function_docker_repository + ) + function.service_config = functions_v2.ServiceConfig() + if memory_mib is not None: + function.service_config.available_memory = f"{memory_mib}Mi" + if timeout_seconds is not None: + if timeout_seconds > 1200: + raise ValueError( + "BigQuery remote function can wait only up to 20 minutes" + ", see for more details " + "https://cloud.google.com/bigquery/quotas#remote_function_limits." + ) + function.service_config.timeout_seconds = timeout_seconds + if max_instance_count is not None: + function.service_config.max_instance_count = max_instance_count + if vpc_connector is not None: + function.service_config.vpc_connector = vpc_connector + function.service_config.service_account_email = ( + self._cloud_function_service_account + ) + function.kms_key_name = self._cloud_function_kms_key_name + create_function_request.function = function + + # Create the cloud function and wait for it to be ready to use + try: + operation = self._cloud_functions_client.create_function( + request=create_function_request + ) + operation.result() + + # Cleanup + os.remove(archive_path) + except google.api_core.exceptions.AlreadyExists: + # If a cloud function with the same name already exists, let's + # update it + update_function_request = functions_v2.UpdateFunctionRequest() + update_function_request.function = function + operation = self._cloud_functions_client.update_function( + request=update_function_request + ) + operation.result() + + # Fetch the endpoint of the just created function + endpoint = self.get_cloud_function_endpoint(cf_name) + if not endpoint: + raise ValueError( + f"Couldn't fetch the http endpoint. 
{constants.FEEDBACK_LINK}" + ) + + logger.info( + f"Successfully created cloud function {cf_name} with uri ({endpoint})" + ) + return endpoint + + def provision_bq_remote_function( + self, + def_, + input_types, + output_type, + reuse, + name, + package_requirements, + max_batching_rows, + cloud_function_timeout, + cloud_function_max_instance_count, + is_row_processor, + cloud_function_vpc_connector, + cloud_function_memory_mib, + ): + """Provision a BigQuery remote function.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = _utils._get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = _utils._get_hash(def_, package_requirements) + + # If reuse of any existing function with the same name (indicated by the + # same hash of its source code) is not intended, then attach a unique + # suffix to the intended function name to make it unique. + uniq_suffix = None + if not reuse: + # use 4 digits as a unique suffix which should suffice for + # uniqueness per session + uniq_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=4) + ) + + # Derive the name of the cloud function underlying the intended BQ + # remote function. Use the session id to identify the GCF for unnamed + # functions. The named remote functions are treated as a persistant + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + session_id = None if name else self._session.session_id + cloud_function_name = _utils.get_cloud_function_name( + function_hash, session_id, uniq_suffix + ) + cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) + + # Create the cloud function if it does not exist + if not cf_endpoint: + cf_endpoint = self.create_cloud_function( + def_, + cloud_function_name, + input_types=input_types, + output_type=output_type, + package_requirements=package_requirements, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instance_count, + is_row_processor=is_row_processor, + vpc_connector=cloud_function_vpc_connector, + memory_mib=cloud_function_memory_mib, + ) + else: + logger.info(f"Cloud function {cloud_function_name} already exists.") + + # Derive the name of the remote function + remote_function_name = name + if not remote_function_name: + remote_function_name = _utils.get_remote_function_name( + function_hash, self._session.session_id, uniq_suffix + ) + rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) + + # Create the BQ remote function in following circumstances: + # 1. It does not exist + # 2. It exists but the existing remote function has different + # configuration than intended + created_new = False + if not rf_endpoint or ( + rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id + ): + input_args = inspect.getargs(def_.__code__).args + if len(input_args) != len(input_types): + raise ValueError( + "Exactly one type should be provided for every input arg." 
+ ) + self.create_bq_remote_function( + input_args, + input_types, + output_type, + cf_endpoint, + remote_function_name, + max_batching_rows, + ) + + created_new = True + else: + logger.info(f"Remote function {remote_function_name} already exists.") + + return remote_function_name, cloud_function_name, created_new + + def get_remote_function_specs(self, remote_function_name): + """Check whether a remote function already exists for the udf.""" + http_endpoint = None + bq_connection = None + routines = self._bq_client.list_routines( + f"{self._gcp_project_id}.{self._bq_dataset}" + ) + try: + for routine in routines: + routine = cast(bigquery.Routine, routine) + if routine.reference.routine_id == remote_function_name: + rf_options = routine.remote_function_options + if rf_options: + http_endpoint = rf_options.endpoint + bq_connection = rf_options.connection + if bq_connection: + bq_connection = os.path.basename(bq_connection) + break + except google.api_core.exceptions.NotFound: + # The dataset might not exist, in which case the http_endpoint doesn't, either. + # Note: list_routines doesn't make an API request until we iterate on the response object. + pass + return (http_endpoint, bq_connection) diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py new file mode 100644 index 0000000000..0ab19ca353 --- /dev/null +++ b/bigframes/functions/_remote_function_session.py @@ -0,0 +1,546 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import collections.abc +import inspect +import sys +import threading +from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union +import warnings + +import google.api_core.exceptions +from google.cloud import ( + bigquery, + bigquery_connection_v1, + functions_v2, + resourcemanager_v3, +) + +from bigframes import clients, constants + +if TYPE_CHECKING: + from bigframes.session import Session + +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes +import ibis +import pandas + +from . import _remote_function_client as rf_client +from . 
import _utils + + +class RemoteFunctionSession: + """Session to manage remote functions.""" + + def __init__(self): + # Session level mapping of remote function artifacts + self._temp_artifacts: Dict[str, str] = dict() + + # Lock to synchronize the update of the session artifacts + self._artifacts_lock = threading.Lock() + + def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): + """Update remote function artifacts in the current session.""" + with self._artifacts_lock: + self._temp_artifacts[bqrf_routine] = gcf_path + + def clean_up( + self, + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + session_id: str, + ): + """Delete remote function artifacts in the current session.""" + with self._artifacts_lock: + for bqrf_routine, gcf_path in self._temp_artifacts.items(): + # Let's accept the possibility that the remote function may have + # been deleted directly by the user + bqclient.delete_routine(bqrf_routine, not_found_ok=True) + + # Let's accept the possibility that the cloud function may have + # been deleted directly by the user + try: + gcfclient.delete_function(name=gcf_path) + except google.api_core.exceptions.NotFound: + pass + + self._temp_artifacts.clear() + + # Inspired by @udf decorator implemented in ibis-bigquery package + # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py + # which has moved as @js to the ibis package + # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py + def remote_function( + self, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + bigquery_connection_client: Optional[ + bigquery_connection_v1.ConnectionServiceClient + ] = None, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + reuse: bool = True, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, + cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, + cloud_function_memory_mib: Optional[int] = 1024, + ): + """Decorator to turn a user defined function into a BigQuery remote function. + + .. deprecated:: 0.0.1 + This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. + + .. note:: + Please make sure following is setup before using this API: + + 1. 
Have the below APIs enabled for your project: + + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API + + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID + + Or from the gcloud CLI: + + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` + + 2. Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: + + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + + Alternatively, the IAM could also be setup via the gcloud CLI: + + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. + + Args: + input_types (None, type, or sequence(type)): + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. + output_type (Optional[type]): + Data type of the output in the user defined function. + session (bigframes.Session, Optional): + BigQuery DataFrames session to use for getting default project, + dataset and BigQuery connection. + bigquery_client (google.cloud.bigquery.Client, Optional): + Client to use for BigQuery operations. If this param is not provided + then bigquery client from the session would be used. + bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): + Client to use for BigQuery connection operations. If this param is + not provided then bigquery connection client from the session would + be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then the functions client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. + dataset (str, Optional.): + Dataset in which to create a BigQuery remote function. It should be in + `.` or `` format. 
If this + parameter is not provided then session dataset id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection in the form of `CONNECTION_ID` or + `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. + If this param is not provided then the bigquery connection from the session + would be used. If it is pre created in the same location as the + `bigquery_client.location` then it would be used, otherwise it is created + dynamically using the `bigquery_connection_client` assuming the user has necessary + priviliges. The PROJECT_ID should be the same as the BigQuery connection project. + reuse (bool, Optional): + Reuse the remote function if already exists. + `True` by default, which will result in reusing an existing remote + function and corresponding cloud function that was previously + created (if any) for the same udf. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. + Setting it to `False` would force creating a unique remote function. + If the required remote function does not exist then it would be + created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. When an explicit name is provided, any session + specific clean up (``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each dependency + is added to the `requirements.txt` as is, and can be of the form + supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided then + the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. + Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. 
+ max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. + cloud_function_timeout (int, Optional): + The maximum amount of time (in seconds) BigQuery should wait for + the cloud function to return a response. See for more details + https://cloud.google.com/functions/docs/configuring/timeout. + Please note that even though the cloud function (2nd gen) itself + allows seeting up to 60 minutes of timeout, BigQuery remote + function can wait only up to 20 minutes, see for more details + https://cloud.google.com/bigquery/quotas#remote_function_limits. + By default BigQuery DataFrames uses a 10 minute timeout. `None` + can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximumm instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at max at any given point of time. Lower setting can help + control the spike in the billing. Higher setting can help + support processing larger scale data. When not specified, cloud + function's default setting applies. For more details see + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. + cloud_function_memory_mib (int, Optional): + The amounts of memory (in mebibytes) to allocate for the cloud + function (2nd gen) created. This also dictates a corresponding + amount of allocated CPU for the function. By default a memory of + 1024 MiB is set for the cloud functions created to support + BigQuery DataFrames remote function. If you want to let the + default memory of cloud functions be allocated, pass `None`. See + for more details + https://cloud.google.com/functions/docs/configuring/memory. + """ + # Some defaults may be used from the session if not provided otherwise + import bigframes.exceptions as bf_exceptions + import bigframes.pandas as bpd + import bigframes.series as bf_series + import bigframes.session + + session = cast(bigframes.session.Session, session or bpd.get_global_session()) + + # A BigQuery client is required to perform BQ operations + if not bigquery_client: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A BigQuery connection client is required to perform BQ connection operations + if not bigquery_connection_client: + bigquery_connection_client = session.bqconnectionclient + if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either directly or via session. 
" + f"{constants.FEEDBACK_LINK}" + ) + + # A cloud functions client is required to perform cloud functions operations + if not cloud_functions_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: + raise ValueError( + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # BQ remote function must be persisted, for which we need a dataset + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. + if dataset: + dataset_ref = bigquery.DatasetReference.from_string( + dataset, default_project=bigquery_client.project + ) + else: + dataset_ref = session._anonymous_dataset + + bq_location, cloud_function_region = _utils.get_remote_function_locations( + bigquery_client.location + ) + + # A connection is required for BQ remote function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function + if not bigquery_connection: + bigquery_connection = session._bq_connection # type: ignore + + bigquery_connection = clients.resolve_full_bq_connection_name( + bigquery_connection, + default_project=dataset_ref.project, + default_location=bq_location, + ) + # Guaranteed to be the form of .. + ( + gcp_project_id, + bq_connection_location, + bq_connection_id, + ) = bigquery_connection.split(".") + if gcp_project_id.casefold() != dataset_ref.project.casefold(): + raise ValueError( + "The project_id does not match BigQuery connection gcp_project_id: " + f"{dataset_ref.project}." + ) + if bq_connection_location.casefold() != bq_location.casefold(): + raise ValueError( + "The location does not match BigQuery connection location: " + f"{bq_location}." + ) + + # If any CMEK is intended then check that a docker repository is also specified + if ( + cloud_function_kms_key_name is not None + and cloud_function_docker_repository is None + ): + raise ValueError( + "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + ) + + bq_connection_manager = session.bqconnectionmanager + + def wrapper(func): + nonlocal input_types, output_type + + if not callable(func): + raise TypeError("f must be callable, got {}".format(func)) + + if sys.version_info >= (3, 10): + # Add `eval_str = True` so that deferred annotations are turned into their + # corresponding type objects. Need Python 3.10 for eval_str parameter. + # https://docs.python.org/3/library/inspect.html#inspect.signature + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + else: + signature_kwargs = {} + + signature = inspect.signature( + func, + **signature_kwargs, + ) + + # Try to get input types via type annotations. + if input_types is None: + input_types = [] + for parameter in signature.parameters.values(): + if (param_type := parameter.annotation) is inspect.Signature.empty: + raise ValueError( + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function." 
+ ) + input_types.append(param_type) + elif not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + + if output_type is None: + if ( + output_type := signature.return_annotation + ) is inspect.Signature.empty: + raise ValueError( + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function." + ) + + # The function will actually be receiving a pandas Series, but allow both + # BigQuery DataFrames and pandas object types for compatibility. + is_row_processor = False + if len(input_types) == 1 and ( + (input_type := input_types[0]) == bf_series.Series + or input_type == pandas.Series + ): + warnings.warn( + "input_types=Series is in preview.", + stacklevel=1, + category=bf_exceptions.PreviewWarning, + ) + + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): + input_types = [input_types] + + # TODO(b/340898611): fix type error + ibis_signature = _utils.ibis_signature_from_python_signature( + signature, input_types, output_type # type: ignore + ) + + remote_function_client = rf_client.RemoteFunctionClient( + dataset_ref.project, + cloud_function_region, + cloud_functions_client, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + session=session, # type: ignore + ) + + # In the unlikely case where the user is trying to re-deploy the same + # function, cleanup the attributes we add below, first. This prevents + # the pickle from having dependencies that might not otherwise be + # present such as ibis or pandas. 
+ def try_delattr(attr): + try: + delattr(func, attr) + except AttributeError: + pass + + try_delattr("bigframes_cloud_function") + try_delattr("bigframes_remote_function") + try_delattr("input_dtypes") + try_delattr("output_dtype") + try_delattr("is_row_processor") + try_delattr("ibis_node") + + ( + rf_name, + cf_name, + created_new, + ) = remote_function_client.provision_bq_remote_function( + func, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + reuse=reuse, + name=name, + package_requirements=packages, + max_batching_rows=max_batching_rows, + cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instance_count=cloud_function_max_instances, + is_row_processor=is_row_processor, + cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_memory_mib=cloud_function_memory_mib, + ) + + # TODO: Move ibis logic to compiler step + node = ibis.udf.scalar.builtin( + func, + name=rf_name, + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) + func.bigframes_cloud_function = ( + remote_function_client.get_cloud_function_fully_qualified_name(cf_name) + ) + func.bigframes_remote_function = ( + remote_function_client.get_remote_function_fully_qualilfied_name( + rf_name + ) + ) + func.input_dtypes = tuple( + [ + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + input_type + ) + for input_type in ibis_signature.input_types + ] + ) + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) + ) + func.is_row_processor = is_row_processor + func.ibis_node = node + + # If a new remote function was created, update the cloud artifacts + # created in the session. This would be used to clean up any + # resources in the session. Note that we need to do this only for + # the case where an explicit name was not provided by the user and + # we used an internal name. For the cases where the user provided an + # explicit name, we are assuming that the user wants to persist them + # with that name and would directly manage their lifecycle. + if created_new and (not name): + self._update_temp_artifacts( + func.bigframes_remote_function, func.bigframes_cloud_function + ) + return func + + return wrapper diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py new file mode 100644 index 0000000000..537473bed8 --- /dev/null +++ b/bigframes/functions/_utils.py @@ -0,0 +1,214 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
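# A minimal usage sketch of the remote_function decorator defined above,
# assuming a configured BigQuery DataFrames session with the APIs, IAM roles
# and BigQuery connection described in the docstring already set up. The
# "squarer" function and the sample data are hypothetical; type annotations
# stand in for explicit input_types/output_type.
import bigframes.pandas as bpd

@bpd.remote_function(reuse=True, max_batching_rows=1000)
def squarer(x: int) -> int:
    return x * x

df = bpd.DataFrame({"a": [1, 2, 3]})
squared = df["a"].apply(squarer).to_pandas()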
+ + +import hashlib +import inspect +from typing import cast, List, NamedTuple, Optional, Sequence, Set + +import cloudpickle +import google.api_core.exceptions +from google.cloud import bigquery, functions_v2 +import ibis.expr.datatypes.core +import numpy +import pandas +import pyarrow + +import bigframes.core.compile.ibis_types + +# Naming convention for the remote function artifacts +_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" +_BQ_FUNCTION_NAME_SEPERATOR = "_" +_GCF_FUNCTION_NAME_SEPERATOR = "-" + +# Protocol version 4 is available in python version 3.4 and above +# https://docs.python.org/3/library/pickle.html#data-stream-format +_pickle_protocol_version = 4 + + +def get_remote_function_locations(bq_location): + """Get BQ location and cloud functions region given a BQ client.""" + # TODO(shobs, b/274647164): Find the best way to determine default location. + # For now let's assume that if no BQ location is set in the client then it + # defaults to US multi region + bq_location = bq_location.lower() if bq_location else "us" + + # Cloud function should be in the same region as the bigquery remote function + cloud_function_region = bq_location + + # BigQuery has multi region but cloud functions does not. + # Any region in the multi region that supports cloud functions should work + # https://cloud.google.com/functions/docs/locations + if bq_location == "us": + cloud_function_region = "us-central1" + elif bq_location == "eu": + cloud_function_region = "europe-west1" + + return bq_location, cloud_function_region + + +def _get_updated_package_requirements( + package_requirements=None, is_row_processor=False +): + requirements = [f"cloudpickle=={cloudpickle.__version__}"] + if is_row_processor: + # bigframes remote function will send an entire row of data as json, + # which would be converted to a pandas series and processed + # Ensure numpy versions match to avoid unpickling problems. See + # internal issue b/347934471. + requirements.append(f"numpy=={numpy.__version__}") + requirements.append(f"pandas=={pandas.__version__}") + requirements.append(f"pyarrow=={pyarrow.__version__}") + + if package_requirements: + requirements.extend(package_requirements) + + requirements = sorted(requirements) + return requirements + + +def _clean_up_by_session_id( + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + dataset: bigquery.DatasetReference, + session_id: str, +): + """Delete remote function artifacts for a session id, where the session id + was not necessarily created in the current runtime. This is useful if the + user worked with a BigQuery DataFrames session previously and remembered the + session id, and now wants to clean up its temporary resources at a later + point in time. 
+ """ + + # First clean up the BQ remote functions and then the underlying + # cloud functions, so that at no point we are left with a remote function + # that is pointing to a cloud function that does not exist + + endpoints_to_be_deleted: Set[str] = set() + match_prefix = "".join( + [ + _BIGFRAMES_REMOTE_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + session_id, + _BQ_FUNCTION_NAME_SEPERATOR, + ] + ) + for routine in bqclient.list_routines(dataset): + routine = cast(bigquery.Routine, routine) + + # skip past the routines not belonging to the given session id, or + # non-remote-function routines + if ( + routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION + or not cast(str, routine.routine_id).startswith(match_prefix) + or not routine.remote_function_options + or not routine.remote_function_options.endpoint + ): + continue + + # Let's forgive the edge case possibility that the BQ remote function + # may have been deleted at the same time directly by the user + bqclient.delete_routine(routine, not_found_ok=True) + endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) + + # Now clean up the cloud functions + bq_location = bqclient.get_dataset(dataset).location + bq_location, gcf_location = get_remote_function_locations(bq_location) + parent_path = gcfclient.common_location_path( + project=dataset.project, location=gcf_location + ) + for gcf in gcfclient.list_functions(parent=parent_path): + # skip past the cloud functions not attached to any BQ remote function + # belonging to the given session id + if gcf.service_config.uri not in endpoints_to_be_deleted: + continue + + # Let's forgive the edge case possibility that the cloud function + # may have been deleted at the same time directly by the user + try: + gcfclient.delete_function(name=gcf.name) + except google.api_core.exceptions.NotFound: + pass + + +def _get_hash(def_, package_requirements=None): + "Get hash (32 digits alphanumeric) of a function." + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. + def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) + if package_requirements: + for p in sorted(package_requirements): + def_repr += p.encode() + return hashlib.md5(def_repr).hexdigest() + + +def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: + return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" + + +def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): + "Get a name for the cloud function for the given user defined function." 
+ parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_hash) + if uniq_suffix: + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_remote_function_name(function_hash, session_id, uniq_suffix=None): + "Get a name for the BQ remote function for the given user defined function." + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] + if uniq_suffix: + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +class IbisSignature(NamedTuple): + parameter_names: List[str] + input_types: List[Optional[ibis.expr.datatypes.core.DataType]] + output_type: ibis.expr.datatypes.core.DataType + + +def ibis_signature_from_python_signature( + signature: inspect.Signature, + input_types: Sequence[type], + output_type: type, +) -> IbisSignature: + + return IbisSignature( + parameter_names=list(signature.parameters.keys()), + input_types=[ + bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) + for t in input_types + ], + output_type=bigframes.core.compile.ibis_types.ibis_type_from_python_type( + output_type + ), + ) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b3c6aee1b3..7e9df74e76 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -14,664 +14,29 @@ from __future__ import annotations -import collections.abc -import hashlib -import inspect import logging -import os -import random -import shutil -import string -import sys -import tempfile -import threading -from typing import ( - Any, - cast, - Dict, - List, - Mapping, - NamedTuple, - Optional, - Sequence, - Set, - Tuple, - TYPE_CHECKING, - Union, -) +from typing import cast, Optional, TYPE_CHECKING import warnings import ibis -import numpy -import pandas -import pyarrow -import requests if TYPE_CHECKING: from bigframes.session import Session -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import cloudpickle import google.api_core.exceptions import google.api_core.retry -from google.cloud import ( - bigquery, - bigquery_connection_v1, - functions_v2, - resourcemanager_v3, -) +from google.cloud import bigquery import google.iam.v1 -from ibis.expr.datatypes.core import DataType as IbisDataType -from bigframes import clients import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.remote_function_template -logger = logging.getLogger(__name__) - -# Naming convention for the remote function artifacts -_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" -_BQ_FUNCTION_NAME_SEPERATOR = "_" -_GCF_FUNCTION_NAME_SEPERATOR = "-" - -# Protocol version 4 is available in python version 3.4 and above -# https://docs.python.org/3/library/pickle.html#data-stream-format -_pickle_protocol_version = 4 - - -def _clean_up_by_session_id( - bqclient: bigquery.Client, - gcfclient: functions_v2.FunctionServiceClient, - dataset: bigquery.DatasetReference, - session_id: str, -): - """Delete remote function artifacts for a session id, where the session id - was not necessarily created in the current runtime. This is useful if the - user worked with a BigQuery DataFrames session previously and remembered the - session id, and now wants to clean up its temporary resources at a later - point in time. 
- """ - - # First clean up the BQ remote functions and then the underlying - # cloud functions, so that at no point we are left with a remote function - # that is pointing to a cloud function that does not exist - - endpoints_to_be_deleted: Set[str] = set() - match_prefix = "".join( - [ - _BIGFRAMES_REMOTE_FUNCTION_PREFIX, - _BQ_FUNCTION_NAME_SEPERATOR, - session_id, - _BQ_FUNCTION_NAME_SEPERATOR, - ] - ) - for routine in bqclient.list_routines(dataset): - routine = cast(bigquery.Routine, routine) - - # skip past the routines not belonging to the given session id, or - # non-remote-function routines - if ( - routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION - or not cast(str, routine.routine_id).startswith(match_prefix) - or not routine.remote_function_options - or not routine.remote_function_options.endpoint - ): - continue - - # Let's forgive the edge case possibility that the BQ remote function - # may have been deleted at the same time directly by the user - bqclient.delete_routine(routine, not_found_ok=True) - endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) - - # Now clean up the cloud functions - bq_location = bqclient.get_dataset(dataset).location - bq_location, gcf_location = get_remote_function_locations(bq_location) - parent_path = gcfclient.common_location_path( - project=dataset.project, location=gcf_location - ) - for gcf in gcfclient.list_functions(parent=parent_path): - # skip past the cloud functions not attached to any BQ remote function - # belonging to the given session id - if gcf.service_config.uri not in endpoints_to_be_deleted: - continue - - # Let's forgive the edge case possibility that the cloud function - # may have been deleted at the same time directly by the user - try: - gcfclient.delete_function(name=gcf.name) - except google.api_core.exceptions.NotFound: - pass - - -def get_remote_function_locations(bq_location): - """Get BQ location and cloud functions region given a BQ client.""" - # TODO(shobs, b/274647164): Find the best way to determine default location. - # For now let's assume that if no BQ location is set in the client then it - # defaults to US multi region - bq_location = bq_location.lower() if bq_location else "us" - - # Cloud function should be in the same region as the bigquery remote function - cloud_function_region = bq_location - - # BigQuery has multi region but cloud functions does not. - # Any region in the multi region that supports cloud functions should work - # https://cloud.google.com/functions/docs/locations - if bq_location == "us": - cloud_function_region = "us-central1" - elif bq_location == "eu": - cloud_function_region = "europe-west1" - - return bq_location, cloud_function_region - - -def _get_hash(def_, package_requirements=None): - "Get hash (32 digits alphanumeric) of a function." - # There is a known cell-id sensitivity of the cloudpickle serialization in - # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of - # this, if a cell contains a udf decorated with @remote_function, a unique - # cloudpickle code is generated every time the cell is run, creating new - # cloud artifacts every time. This is slow and wasteful. - # A workaround of the same can be achieved by replacing the filename in the - # code object to a static value - # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. - # - # To respect the user code/environment let's make this modification on a - # copy of the udf, not on the original udf itself. 
- def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) - def_copy.__code__ = def_copy.__code__.replace( - co_filename="bigframes_place_holder_filename" - ) - - def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) - if package_requirements: - for p in sorted(package_requirements): - def_repr += p.encode() - return hashlib.md5(def_repr).hexdigest() - - -def _get_updated_package_requirements( - package_requirements=None, is_row_processor=False -): - requirements = [f"cloudpickle=={cloudpickle.__version__}"] - if is_row_processor: - # bigframes remote function will send an entire row of data as json, - # which would be converted to a pandas series and processed - # Ensure numpy versions match to avoid unpickling problems. See - # internal issue b/347934471. - requirements.append(f"numpy=={numpy.__version__}") - requirements.append(f"pandas=={pandas.__version__}") - requirements.append(f"pyarrow=={pyarrow.__version__}") - - if package_requirements: - requirements.extend(package_requirements) - - requirements = sorted(requirements) - return requirements - - -def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: - return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" - - -class IbisSignature(NamedTuple): - parameter_names: List[str] - input_types: List[Optional[IbisDataType]] - output_type: IbisDataType - - -def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): - "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_hash) - if uniq_suffix: - parts.append(uniq_suffix) - return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_remote_function_name(function_hash, session_id, uniq_suffix=None): - "Get a name for the BQ remote function for the given user defined function." 
- parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] - if uniq_suffix: - parts.append(uniq_suffix) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - -class RemoteFunctionClient: - # Wait time (in seconds) for an IAM binding to take effect after creation - _iam_wait_seconds = 120 - - def __init__( - self, - gcp_project_id, - cloud_function_region, - cloud_functions_client, - bq_location, - bq_dataset, - bq_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - *, - session: Session, - ): - self._gcp_project_id = gcp_project_id - self._cloud_function_region = cloud_function_region - self._cloud_functions_client = cloud_functions_client - self._bq_location = bq_location - self._bq_dataset = bq_dataset - self._bq_client = bq_client - self._bq_connection_id = bq_connection_id - self._bq_connection_manager = bq_connection_manager - self._cloud_function_service_account = cloud_function_service_account - self._cloud_function_kms_key_name = cloud_function_kms_key_name - self._cloud_function_docker_repository = cloud_function_docker_repository - self._session = session - - def create_bq_remote_function( - self, - input_args, - input_types, - output_type, - endpoint, - bq_function_name, - max_batching_rows, - ): - """Create a BigQuery remote function given the artifacts of a user defined - function and the http endpoint of a corresponding cloud function.""" - if self._bq_connection_manager: - self._bq_connection_manager.create_bq_connection( - self._gcp_project_id, - self._bq_location, - self._bq_connection_id, - "run.invoker", - ) - - # Create BQ function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 - bq_function_args = [] - bq_function_return_type = output_type - - # We are expecting the input type annotations to be 1:1 with the input args - for name, type_ in zip(input_args, input_types): - bq_function_args.append(f"{name} {type_}") - - remote_function_options = { - "endpoint": endpoint, - "max_batching_rows": max_batching_rows, - } - - remote_function_options_str = ", ".join( - [ - f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" - for key, val in remote_function_options.items() - if val is not None - ] - ) - - create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) - RETURNS {bq_function_return_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ({remote_function_options_str})""" - - logger.info(f"Creating BQ remote function: {create_function_ddl}") - - # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and - # create it - dataset = bigquery.Dataset( - bigquery.DatasetReference.from_string( - self._bq_dataset, default_project=self._gcp_project_id - ) - ) - dataset.location = self._bq_location - try: - # This check does not require bigquery.datasets.create IAM - # permission. So, if the data set already exists, then user can work - # without having that permission. - self._bq_client.get_dataset(dataset) - except google.api_core.exceptions.NotFound: - # This requires bigquery.datasets.create IAM permission - self._bq_client.create_dataset(dataset, exists_ok=True) - - # TODO(swast): plumb through the original, user-facing api_name. 
- _, query_job = self._session._start_query(create_function_ddl) - logger.info(f"Created remote function {query_job.ddl_target_routine}") - - def get_cloud_function_fully_qualified_parent(self): - "Get the fully qualilfied parent for a cloud function." - return self._cloud_functions_client.common_location_path( - self._gcp_project_id, self._cloud_function_region - ) - - def get_cloud_function_fully_qualified_name(self, name): - "Get the fully qualilfied name for a cloud function." - return self._cloud_functions_client.function_path( - self._gcp_project_id, self._cloud_function_region, name - ) - - def get_remote_function_fully_qualilfied_name(self, name): - "Get the fully qualilfied name for a BQ remote function." - return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" - - def get_cloud_function_endpoint(self, name): - """Get the http endpoint of a cloud function if it exists.""" - fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) - try: - response = self._cloud_functions_client.get_function( - name=fully_qualified_name - ) - return response.service_config.uri - except google.api_core.exceptions.NotFound: - pass - return None - - def generate_cloud_function_code( - self, - def_, - directory, - *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - is_row_processor=False, - ): - """Generate the cloud function code for a given user defined function. - - Args: - input_types (tuple[str]): - Types of the input arguments in BigQuery SQL data type names. - output_type (str): - Types of the output scalar as a BigQuery SQL data type name. - """ - - # requirements.txt - if package_requirements: - requirements_txt = os.path.join(directory, "requirements.txt") - with open(requirements_txt, "w") as f: - f.write("\n".join(package_requirements)) - - # main.py - entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( - def_, - directory, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, - ) - return entry_point - - def create_cloud_function( - self, - def_, - cf_name, - *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - timeout_seconds=600, - max_instance_count=None, - is_row_processor=False, - vpc_connector=None, - memory_mib=1024, - ): - """Create a cloud function from the given user defined function. - - Args: - input_types (tuple[str]): - Types of the input arguments in BigQuery SQL data type names. - output_type (str): - Types of the output scalar as a BigQuery SQL data type name. - """ - - # Build and deploy folder structure containing cloud function - with tempfile.TemporaryDirectory() as directory: - entry_point = self.generate_cloud_function_code( - def_, - directory, - package_requirements=package_requirements, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, - ) - archive_path = shutil.make_archive(directory, "zip", directory) - - # We are creating cloud function source code from the currently running - # python version. Use the same version to deploy. This is necessary - # because cloudpickle serialization done in one python version and - # deserialization done in another python version doesn't work. - # TODO(shobs): Figure out how to achieve version compatibility, specially - # when pickle (internally used by cloudpickle) guarantees that: - # https://docs.python.org/3/library/pickle.html#:~:text=The%20pickle%20serialization%20format%20is,unique%20breaking%20change%20language%20boundary. 
- python_version = "python{}{}".format( - sys.version_info.major, sys.version_info.minor - ) - - # Determine an upload URL for user code - upload_url_request = functions_v2.GenerateUploadUrlRequest( - kms_key_name=self._cloud_function_kms_key_name - ) - upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() - upload_url_response = self._cloud_functions_client.generate_upload_url( - request=upload_url_request - ) - - # Upload the code to GCS - with open(archive_path, "rb") as f: - response = requests.put( - upload_url_response.upload_url, - data=f, - headers={"content-type": "application/zip"}, - ) - if response.status_code != 200: - raise RuntimeError( - "Failed to upload user code. code={}, reason={}, text={}".format( - response.status_code, response.reason, response.text - ) - ) - - # Deploy Cloud Function - create_function_request = functions_v2.CreateFunctionRequest() - create_function_request.parent = ( - self.get_cloud_function_fully_qualified_parent() - ) - create_function_request.function_id = cf_name - function = functions_v2.Function() - function.name = self.get_cloud_function_fully_qualified_name(cf_name) - function.build_config = functions_v2.BuildConfig() - function.build_config.runtime = python_version - function.build_config.entry_point = entry_point - function.build_config.source = functions_v2.Source() - function.build_config.source.storage_source = functions_v2.StorageSource() - function.build_config.source.storage_source.bucket = ( - upload_url_response.storage_source.bucket - ) - function.build_config.source.storage_source.object_ = ( - upload_url_response.storage_source.object_ - ) - function.build_config.docker_repository = ( - self._cloud_function_docker_repository - ) - function.service_config = functions_v2.ServiceConfig() - if memory_mib is not None: - function.service_config.available_memory = f"{memory_mib}Mi" - if timeout_seconds is not None: - if timeout_seconds > 1200: - raise ValueError( - "BigQuery remote function can wait only up to 20 minutes" - ", see for more details " - "https://cloud.google.com/bigquery/quotas#remote_function_limits." - ) - function.service_config.timeout_seconds = timeout_seconds - if max_instance_count is not None: - function.service_config.max_instance_count = max_instance_count - if vpc_connector is not None: - function.service_config.vpc_connector = vpc_connector - function.service_config.service_account_email = ( - self._cloud_function_service_account - ) - function.kms_key_name = self._cloud_function_kms_key_name - create_function_request.function = function - - # Create the cloud function and wait for it to be ready to use - try: - operation = self._cloud_functions_client.create_function( - request=create_function_request - ) - operation.result() - - # Cleanup - os.remove(archive_path) - except google.api_core.exceptions.AlreadyExists: - # If a cloud function with the same name already exists, let's - # update it - update_function_request = functions_v2.UpdateFunctionRequest() - update_function_request.function = function - operation = self._cloud_functions_client.update_function( - request=update_function_request - ) - operation.result() - - # Fetch the endpoint of the just created function - endpoint = self.get_cloud_function_endpoint(cf_name) - if not endpoint: - raise ValueError( - f"Couldn't fetch the http endpoint. 
{constants.FEEDBACK_LINK}" - ) - - logger.info( - f"Successfully created cloud function {cf_name} with uri ({endpoint})" - ) - return endpoint - - def provision_bq_remote_function( - self, - def_, - input_types, - output_type, - reuse, - name, - package_requirements, - max_batching_rows, - cloud_function_timeout, - cloud_function_max_instance_count, - is_row_processor, - cloud_function_vpc_connector, - cloud_function_memory_mib, - ): - """Provision a BigQuery remote function.""" - # Augment user package requirements with any internal package - # requirements - package_requirements = _get_updated_package_requirements( - package_requirements, is_row_processor - ) - - # Compute a unique hash representing the user code - function_hash = _get_hash(def_, package_requirements) - - # If reuse of any existing function with the same name (indicated by the - # same hash of its source code) is not intended, then attach a unique - # suffix to the intended function name to make it unique. - uniq_suffix = None - if not reuse: - # use 4 digits as a unique suffix which should suffice for - # uniqueness per session - uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=4) - ) - - # Derive the name of the cloud function underlying the intended BQ - # remote function. Use the session id to identify the GCF for unnamed - # functions. The named remote functions are treated as a persistant - # artifacts, so let's keep them independent of session id, which also - # makes their naming more stable for the same udf code - session_id = None if name else self._session.session_id - cloud_function_name = get_cloud_function_name( - function_hash, session_id, uniq_suffix - ) - cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) - - # Create the cloud function if it does not exist - if not cf_endpoint: - cf_endpoint = self.create_cloud_function( - def_, - cloud_function_name, - input_types=input_types, - output_type=output_type, - package_requirements=package_requirements, - timeout_seconds=cloud_function_timeout, - max_instance_count=cloud_function_max_instance_count, - is_row_processor=is_row_processor, - vpc_connector=cloud_function_vpc_connector, - memory_mib=cloud_function_memory_mib, - ) - else: - logger.info(f"Cloud function {cloud_function_name} already exists.") - - # Derive the name of the remote function - remote_function_name = name - if not remote_function_name: - remote_function_name = get_remote_function_name( - function_hash, self._session.session_id, uniq_suffix - ) - rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) - - # Create the BQ remote function in following circumstances: - # 1. It does not exist - # 2. It exists but the existing remote function has different - # configuration than intended - created_new = False - if not rf_endpoint or ( - rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id - ): - input_args = inspect.getargs(def_.__code__).args - if len(input_args) != len(input_types): - raise ValueError( - "Exactly one type should be provided for every input arg." - ) - self.create_bq_remote_function( - input_args, - input_types, - output_type, - cf_endpoint, - remote_function_name, - max_batching_rows, - ) - - created_new = True - else: - logger.info(f"Remote function {remote_function_name} already exists.") - - return remote_function_name, cloud_function_name, created_new +from . import _remote_function_session as rf_session +from . 
import _utils - def get_remote_function_specs(self, remote_function_name): - """Check whether a remote function already exists for the udf.""" - http_endpoint = None - bq_connection = None - routines = self._bq_client.list_routines( - f"{self._gcp_project_id}.{self._bq_dataset}" - ) - try: - for routine in routines: - routine = cast(bigquery.Routine, routine) - if routine.reference.routine_id == remote_function_name: - rf_options = routine.remote_function_options - if rf_options: - http_endpoint = rf_options.endpoint - bq_connection = rf_options.connection - if bq_connection: - bq_connection = os.path.basename(bq_connection) - break - except google.api_core.exceptions.NotFound: - # The dataset might not exist, in which case the http_endpoint doesn't, either. - # Note: list_routines doesn't make an API request until we iterate on the response object. - pass - return (http_endpoint, bq_connection) +logger = logging.getLogger(__name__) class UnsupportedTypeError(ValueError): @@ -680,34 +45,16 @@ def __init__(self, type_, supported_types): self.supported_types = supported_types -def ibis_signature_from_python_signature( - signature: inspect.Signature, - input_types: Sequence[type], - output_type: type, -) -> IbisSignature: - - return IbisSignature( - parameter_names=list(signature.parameters.keys()), - input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) - for t in input_types - ], - output_type=bigframes.core.compile.ibis_types.ibis_type_from_python_type( - output_type - ), - ) - - class ReturnTypeMissingError(ValueError): pass # TODO: Move this to compile folder -def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: +def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if not routine.return_type: raise ReturnTypeMissingError - return IbisSignature( + return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ bigframes.core.compile.ibis_types.ibis_type_from_type_kind( @@ -748,515 +95,12 @@ def get_routine_reference( return dataset_ref.routine(routine_ref_str) -class _RemoteFunctionSession: - """Session to manage remote functions.""" - - def __init__(self): - # Session level mapping of remote function artifacts - self._temp_artifacts: Dict[str, str] = dict() - - # Lock to synchronize the update of the session artifacts - self._artifacts_lock = threading.Lock() - - def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): - """Update remote function artifacts in the current session.""" - with self._artifacts_lock: - self._temp_artifacts[bqrf_routine] = gcf_path - - def clean_up( - self, - bqclient: bigquery.Client, - gcfclient: functions_v2.FunctionServiceClient, - session_id: str, - ): - """Delete remote function artifacts in the current session.""" - with self._artifacts_lock: - for bqrf_routine, gcf_path in self._temp_artifacts.items(): - # Let's accept the possibility that the remote function may have - # been deleted directly by the user - bqclient.delete_routine(bqrf_routine, not_found_ok=True) - - # Let's accept the possibility that the cloud function may have - # been deleted directly by the user - try: - gcfclient.delete_function(name=gcf_path) - except google.api_core.exceptions.NotFound: - pass - - self._temp_artifacts.clear() - - # Inspired by @udf decorator implemented in ibis-bigquery package - # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py - # which has moved as @js to the ibis package - # 
https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py - def remote_function( - self, - input_types: Union[None, type, Sequence[type]] = None, - output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, - bigquery_connection_client: Optional[ - bigquery_connection_v1.ConnectionServiceClient - ] = None, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, - dataset: Optional[str] = None, - bigquery_connection: Optional[str] = None, - reuse: bool = True, - name: Optional[str] = None, - packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, - cloud_function_kms_key_name: Optional[str] = None, - cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, - cloud_function_timeout: Optional[int] = 600, - cloud_function_max_instances: Optional[int] = None, - cloud_function_vpc_connector: Optional[str] = None, - cloud_function_memory_mib: Optional[int] = 1024, - ): - """Decorator to turn a user defined function into a BigQuery remote function. - - .. deprecated:: 0.0.1 - This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - - .. note:: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - - * BigQuery Connection API - * Cloud Functions API - * Cloud Run API - * Cloud Build API - * Artifact Registry API - * Cloud Resource Manager API - - This can be done from the cloud console (change `PROJECT_ID` to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - - Or from the gcloud CLI: - - `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` - - 2. Have following IAM roles enabled for you: - - * BigQuery Data Editor (roles/bigquery.dataEditor) - * BigQuery Connection Admin (roles/bigquery.connectionAdmin) - * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` - * Storage Object Viewer (roles/storage.objectViewer) - * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - - 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - - Alternatively, the IAM could also be setup via the gcloud CLI: - - `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. 
- - Args: - input_types (None, type, or sequence(type)): - For scalar user defined function it should be the input type or - sequence of input types. For row processing user defined function, - type `Series` should be specified. - output_type (Optional[type]): - Data type of the output in the user defined function. - session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not provided - then bigquery client from the session would be used. - bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for BigQuery connection operations. If this param is - not provided then bigquery connection client from the session would - be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then the functions client from the session would be used. - resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): - Client to use for cloud resource management operations, e.g. for - getting and setting IAM roles on cloud resources. If this param is - not provided then resource manager client from the session would be - used. - dataset (str, Optional.): - Dataset in which to create a BigQuery remote function. It should be in - `.` or `` format. If this - parameter is not provided then session dataset id is used. - bigquery_connection (str, Optional): - Name of the BigQuery connection in the form of `CONNECTION_ID` or - `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. - If this param is not provided then the bigquery connection from the session - would be used. If it is pre created in the same location as the - `bigquery_client.location` then it would be used, otherwise it is created - dynamically using the `bigquery_connection_client` assuming the user has necessary - priviliges. The PROJECT_ID should be the same as the BigQuery connection project. - reuse (bool, Optional): - Reuse the remote function if already exists. - `True` by default, which will result in reusing an existing remote - function and corresponding cloud function that was previously - created (if any) for the same udf. - Please note that for an unnamed (i.e. created without an explicit - `name` argument) remote function, the BigQuery DataFrames - session id is attached in the cloud artifacts names. So for the - effective reuse across the sessions it is recommended to create - the remote function with an explicit `name`. - Setting it to `False` would force creating a unique remote function. - If the required remote function does not exist then it would be - created irrespective of this param. - name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. When an explicit name is provided, any session - specific clean up (``bigframes.session.Session.close``/ - ``bigframes.pandas.close_session``/ - ``bigframes.pandas.reset_session``/ - ``bigframes.pandas.clean_up_by_session_id``) does not clean up - the function, and leaves it for the user to manage the function - and the associated cloud function directly. 
- packages (str[], Optional): - Explicit name of the external package dependencies. Each dependency - is added to the `requirements.txt` as is, and can be of the form - supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. If not provided then - the default service account would be used. See - https://cloud.google.com/functions/docs/securing/function-identity - for more details. Please make sure the service account has the - necessary IAM permissions configured as described in - https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. - cloud_function_kms_key_name (str, Optional): - Customer managed encryption key to protect cloud functions and - related data at rest. This is of the format - projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. - Read https://cloud.google.com/functions/docs/securing/cmek for - more details including granting necessary service accounts - access to the key. - cloud_function_docker_repository (str, Optional): - Docker repository created with the same encryption key as - `cloud_function_kms_key_name` to store encrypted artifacts - created to support the cloud function. This is of the format - projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. - For more details see - https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. - max_batching_rows (int, Optional): - The maximum number of rows to be batched for processing in the - BQ remote function. Default value is 1000. A lower number can be - passed to avoid timeouts in case the user code is too complex to - process large number of rows fast enough. A higher number can be - used to increase throughput in case the user code is fast enough. - `None` can be passed to let BQ remote functions service apply - default batching. See for more details - https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. - cloud_function_timeout (int, Optional): - The maximum amount of time (in seconds) BigQuery should wait for - the cloud function to return a response. See for more details - https://cloud.google.com/functions/docs/configuring/timeout. - Please note that even though the cloud function (2nd gen) itself - allows seeting up to 60 minutes of timeout, BigQuery remote - function can wait only up to 20 minutes, see for more details - https://cloud.google.com/bigquery/quotas#remote_function_limits. - By default BigQuery DataFrames uses a 10 minute timeout. `None` - can be passed to let the cloud functions default timeout take effect. - cloud_function_max_instances (int, Optional): - The maximumm instance count for the cloud function created. This - can be used to control how many cloud function instances can be - active at max at any given point of time. Lower setting can help - control the spike in the billing. Higher setting can help - support processing larger scale data. When not specified, cloud - function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances. - cloud_function_vpc_connector (str, Optional): - The VPC connector you would like to configure for your cloud - function. This is useful if your code needs access to data or - service(s) that are on a VPC network. See for more details - https://cloud.google.com/functions/docs/networking/connecting-vpc. 
- cloud_function_memory_mib (int, Optional): - The amounts of memory (in mebibytes) to allocate for the cloud - function (2nd gen) created. This also dictates a corresponding - amount of allocated CPU for the function. By default a memory of - 1024 MiB is set for the cloud functions created to support - BigQuery DataFrames remote function. If you want to let the - default memory of cloud functions be allocated, pass `None`. See - for more details - https://cloud.google.com/functions/docs/configuring/memory. - """ - # Some defaults may be used from the session if not provided otherwise - import bigframes.exceptions as bf_exceptions - import bigframes.pandas as bpd - import bigframes.series as bf_series - import bigframes.session - - session = cast(bigframes.session.Session, session or bpd.get_global_session()) - - # A BigQuery client is required to perform BQ operations - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A BigQuery connection client is required to perform BQ connection operations - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise ValueError( - "A bigquery connection client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A cloud functions client is required to perform cloud functions operations - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise ValueError( - "A cloud functions client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A resource manager client is required to get/set IAM operations - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise ValueError( - "A resource manager client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # BQ remote function must be persisted, for which we need a dataset - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. - if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project - ) - else: - dataset_ref = session._anonymous_dataset - - bq_location, cloud_function_region = get_remote_function_locations( - bigquery_client.location - ) - - # A connection is required for BQ remote function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function - if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore - - bigquery_connection = clients.resolve_full_bq_connection_name( - bigquery_connection, - default_project=dataset_ref.project, - default_location=bq_location, - ) - # Guaranteed to be the form of .. - ( - gcp_project_id, - bq_connection_location, - bq_connection_id, - ) = bigquery_connection.split(".") - if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( - "The project_id does not match BigQuery connection gcp_project_id: " - f"{dataset_ref.project}." - ) - if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( - "The location does not match BigQuery connection location: " - f"{bq_location}." 
- ) - - # If any CMEK is intended then check that a docker repository is also specified - if ( - cloud_function_kms_key_name is not None - and cloud_function_docker_repository is None - ): - raise ValueError( - "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." - " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" - ) - - bq_connection_manager = session.bqconnectionmanager - - def wrapper(func): - nonlocal input_types, output_type - - if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) - - if sys.version_info >= (3, 10): - # Add `eval_str = True` so that deferred annotations are turned into their - # corresponding type objects. Need Python 3.10 for eval_str parameter. - # https://docs.python.org/3/library/inspect.html#inspect.signature - signature_kwargs: Mapping[str, Any] = {"eval_str": True} - else: - signature_kwargs = {} - - signature = inspect.signature( - func, - **signature_kwargs, - ) - - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise ValueError( - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function." - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise ValueError( - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function." - ) - - # The function will actually be receiving a pandas Series, but allow both - # BigQuery DataFrames and pandas object types for compatibility. - is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - warnings.warn( - "input_types=Series is in preview.", - stacklevel=1, - category=bf_exceptions.PreviewWarning, - ) - - # we will model the row as a json serialized string containing the data - # and the metadata representing the row - input_types = [str] - is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error - ibis_signature = ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) - - remote_function_client = RemoteFunctionClient( - dataset_ref.project, - cloud_function_region, - cloud_functions_client, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - session=session, # type: ignore - ) - - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add below, first. This prevents - # the pickle from having dependencies that might not otherwise be - # present such as ibis or pandas. 
- def try_delattr(attr): - try: - delattr(func, attr) - except AttributeError: - pass - - try_delattr("bigframes_cloud_function") - try_delattr("bigframes_remote_function") - try_delattr("input_dtypes") - try_delattr("output_dtype") - try_delattr("is_row_processor") - try_delattr("ibis_node") - - ( - rf_name, - cf_name, - created_new, - ) = remote_function_client.provision_bq_remote_function( - func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), - reuse=reuse, - name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, - cloud_function_timeout=cloud_function_timeout, - cloud_function_max_instance_count=cloud_function_max_instances, - is_row_processor=is_row_processor, - cloud_function_vpc_connector=cloud_function_vpc_connector, - cloud_function_memory_mib=cloud_function_memory_mib, - ) - - # TODO: Move ibis logic to compiler step - node = ibis.udf.scalar.builtin( - func, - name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) - func.bigframes_cloud_function = ( - remote_function_client.get_cloud_function_fully_qualified_name(cf_name) - ) - func.bigframes_remote_function = ( - remote_function_client.get_remote_function_fully_qualilfied_name( - rf_name - ) - ) - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.is_row_processor = is_row_processor - func.ibis_node = node - - # If a new remote function was created, update the cloud artifacts - # created in the session. This would be used to clean up any - # resources in the session. Note that we need to do this only for - # the case where an explicit name was not provided by the user and - # we used an internal name. For the cases where the user provided an - # explicit name, we are assuming that the user wants to persist them - # with that name and would directly manage their lifecycle. 
- if created_new and (not name): - self._update_temp_artifacts( - func.bigframes_remote_function, func.bigframes_cloud_function - ) - return func - - return wrapper - - def remote_function(*args, **kwargs): - remote_function_session = _RemoteFunctionSession() + remote_function_session = rf_session.RemoteFunctionSession() return remote_function_session.remote_function(*args, **kwargs) -remote_function.__doc__ = _RemoteFunctionSession.remote_function.__doc__ +remote_function.__doc__ = rf_session.RemoteFunctionSession.remote_function.__doc__ def read_gbq_function( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 21f75eb82c..08d808572d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -63,7 +63,7 @@ import bigframes.core.tools import bigframes.dataframe import bigframes.enums -import bigframes.functions.remote_function as bigframes_rf +import bigframes.functions._utils as functions_utils import bigframes.operations as ops import bigframes.series import bigframes.session @@ -817,7 +817,7 @@ def clean_up_by_session_id( session.bqclient, dataset, session_id ) - bigframes_rf._clean_up_by_session_id( + functions_utils._clean_up_by_session_id( session.bqclient, session.cloudfunctionsclient, dataset, session_id ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2da788292b..8cef869a32 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -95,6 +95,7 @@ import bigframes.dtypes import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers +import bigframes.functions._remote_function_session as bigframes_rf_session import bigframes.functions.remote_function as bigframes_rf import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table @@ -316,7 +317,7 @@ def __init__( ) self._allow_ambiguity = not self._strictly_ordered - self._remote_function_session = bigframes_rf._RemoteFunctionSession() + self._remote_function_session = bigframes_rf_session.RemoteFunctionSession() @property def bqclient(self): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 095f7059cd..d6eefc1e31 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -31,7 +31,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.functions.remote_function as bigframes_rf +import bigframes.functions._utils as functions_utils import bigframes.pandas as bpd import bigframes.series from tests.system.utils import ( @@ -595,9 +595,11 @@ def add_one(x): add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) # Expected cloud function name for the unique udf - package_requirements = bigframes_rf._get_updated_package_requirements() - add_one_uniq_hash = bigframes_rf._get_hash(add_one_uniq, package_requirements) - add_one_uniq_cf_name = bigframes_rf.get_cloud_function_name( + package_requirements = functions_utils._get_updated_package_requirements() + add_one_uniq_hash = functions_utils._get_hash( + add_one_uniq, package_requirements + ) + add_one_uniq_cf_name = functions_utils.get_cloud_function_name( add_one_uniq_hash, session.session_id ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 8ecf9eb368..db573efa40 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -23,6 +23,7 @@ import 
bigframes import bigframes.dtypes import bigframes.exceptions +from bigframes.functions import _utils as rf_utils from bigframes.functions import remote_function as rf from tests.system.utils import assert_pandas_df_equal @@ -89,12 +90,12 @@ def get_rf_name(func, package_requirements=None, is_row_processor=False): """Get a remote function name for testing given a udf.""" # Augment user package requirements with any internal package # requirements - package_requirements = rf._get_updated_package_requirements( + package_requirements = rf_utils._get_updated_package_requirements( package_requirements, is_row_processor ) # Compute a unique hash representing the user code - function_hash = rf._get_hash(func, package_requirements) + function_hash = rf_utils._get_hash(func, package_requirements) return f"bigframes_{function_hash}" @@ -714,7 +715,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} - routine_ref_str = rf.routine_ref_to_string_for_query(routine.reference) + routine_ref_str = rf_utils.routine_ref_to_string_for_query(routine.reference) direct_sql = " UNION ALL ".join( [f"SELECT {x} AS x, {routine_ref_str}({x}) AS y" for x in src["x"]] ) diff --git a/tests/system/utils.py b/tests/system/utils.py index 9fbf191a3a..e9054d04c9 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -26,7 +26,7 @@ import pyarrow as pa # type: ignore import pytest -from bigframes.functions import remote_function +import bigframes.functions._utils as functions_utils import bigframes.pandas ML_REGRESSION_METRICS = [ @@ -340,7 +340,7 @@ def get_cloud_functions( not name or not name_prefix ), "Either 'name' or 'name_prefix' can be passed but not both." - _, location = remote_function.get_remote_function_locations(location) + _, location = functions_utils.get_remote_function_locations(location) parent = f"projects/{project}/locations/{location}" request = functions_v2.ListFunctionsRequest(parent=parent) page_result = functions_client.list_functions(request=request) From 6dff860758bd5de08f0692703f27906e1efbe7e6 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:15:52 -0700 Subject: [PATCH 13/15] chore: update owlbot script to prevent silent failures at s.replace (#889) * chore: update owlbot script to prevent silent failures at s.replace * fix errors * removing s.replace for CONTRIBUTING.rst because it was excluded from templated files --- owlbot.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/owlbot.py b/owlbot.py index f9d9410d6d..b29384d462 100644 --- a/owlbot.py +++ b/owlbot.py @@ -61,7 +61,7 @@ # ---------------------------------------------------------------------------- # Encourage sharring all relevant versions in bug reports. -s.replace( +assert 1 == s.replace( [".github/ISSUE_TEMPLATE/bug_report.md"], re.escape("#### Steps to reproduce\n"), textwrap.dedent( @@ -90,7 +90,7 @@ ) # Make sure build includes all necessary files. -s.replace( +assert 1 == s.replace( ["MANIFEST.in"], re.escape("recursive-include google"), "recursive-include third_party/bigframes_vendored *\nrecursive-include bigframes", @@ -98,7 +98,7 @@ # Even though BigQuery DataFrames isn't technically a client library, we are # opting into Cloud RAD for docs hosting. 
-s.replace( +assert 1 == s.replace( [".kokoro/docs/common.cfg"], re.escape('value: "docs-staging-v2-staging"'), 'value: "docs-staging-v2"', @@ -106,7 +106,7 @@ # Use a custom table of contents since the default one isn't organized well # enough for the number of classes we have. -s.replace( +assert 1 == s.replace( [".kokoro/publish-docs.sh"], ( re.escape("# upload docs") @@ -124,19 +124,12 @@ ) # Fixup the documentation. -s.replace( +assert 1 == s.replace( ["docs/conf.py"], re.escape("Google Cloud Client Libraries for bigframes"), "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", ) -# Update the contributing guide to reflect some differences in this repo. -s.replace( - ["CONTRIBUTING.rst"], - re.escape("blacken"), - "format", -) - # ---------------------------------------------------------------------------- # Samples templates # ---------------------------------------------------------------------------- From e027b7e9d29f628d058611106014a1790459958c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:24:21 -0700 Subject: [PATCH 14/15] feat: Series.str.__getitem__ (#897) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [X] Make sure to open an issue as internal issue: 358459166 - [X] Ensure the tests and linter pass - [X] Code coverage does not decrease (if any source code was changed) - [X] Appropriate docs were updated (if necessary) Fixes internal issue: 358459166 🦕 --- bigframes/core/compile/scalar_op_compiler.py | 20 ++- bigframes/operations/__init__.py | 34 ++++ bigframes/operations/strings.py | 27 +++ tests/system/small/operations/test_strings.py | 157 +++++++++++++++--- .../pandas/core/strings/accessor.py | 31 ++++ 5 files changed, 241 insertions(+), 28 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 32749b32a6..e70c49e337 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -902,6 +902,24 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter) +@scalar_op_compiler.register_unary_op(ops.ArrayIndexOp, pass_op=True) +def array_index_op_impl(x: ibis_types.Value, op: ops.ArrayIndexOp): + res = typing.cast(ibis_types.ArrayValue, x)[op.index] + if x.type().is_string(): + return _null_or_value(res, res != ibis.literal("")) + else: + return res + + +@scalar_op_compiler.register_unary_op(ops.ArraySliceOp, pass_op=True) +def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): + res = typing.cast(ibis_types.ArrayValue, x)[op.start : op.stop : op.step] + if x.type().is_string(): + return _null_or_value(res, res != ibis.literal("")) + else: + return res + + # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): @@ -984,7 +1002,7 @@ def ne_op( def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): - return ibis.where( + return ibis.ifelse( where_value, value, ibis.null(), diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 4d4e40643d..fb333d7a53 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,6 +602,40 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +@dataclasses.dataclass(frozen=True) +class ArrayIndexOp(UnaryOp): 
+ name: typing.ClassVar[str] = "array_index" + index: int + + def output_type(self, *input_types): + input_type = input_types[0] + if dtypes.is_string_like(input_type): + return dtypes.STRING_DTYPE + elif dtypes.is_array_like(input_type): + return dtypes.arrow_dtype_to_bigframes_dtype( + input_type.pyarrow_dtype.value_type + ) + else: + raise TypeError("Input type must be an array or string-like type.") + + +@dataclasses.dataclass(frozen=True) +class ArraySliceOp(UnaryOp): + name: typing.ClassVar[str] = "array_slice" + start: int + stop: typing.Optional[int] = None + step: typing.Optional[int] = None + + def output_type(self, *input_types): + input_type = input_types[0] + if dtypes.is_string_like(input_type): + return dtypes.STRING_DTYPE + elif dtypes.is_array_like(input_type): + return input_type + else: + raise TypeError("Input type must be an array or string-like type.") + + ## JSON Ops @dataclasses.dataclass(frozen=True) class JSONExtract(UnaryOp): diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 22c325d7e0..d3e9c7edc6 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -38,6 +38,33 @@ class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMethods): __doc__ = vendorstr.StringMethods.__doc__ + def __getitem__(self, key: Union[int, slice]) -> series.Series: + if isinstance(key, int): + if key < 0: + raise NotImplementedError("Negative indexing is not supported.") + return self._apply_unary_op(ops.ArrayIndexOp(index=key)) + elif isinstance(key, slice): + if key.step is not None and key.step != 1: + raise NotImplementedError( + f"Only a step of 1 is allowed, got {key.step}" + ) + if (key.start is not None and key.start < 0) or ( + key.stop is not None and key.stop < 0 + ): + raise NotImplementedError( + "Slicing with negative numbers is not allowed." 
+ ) + + return self._apply_unary_op( + ops.ArraySliceOp( + start=key.start if key.start is not None else 0, + stop=key.stop, + step=key.step, + ) + ) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + def find( self, sub: str, diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index b8a8ad2d1e..3191adf920 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -14,10 +14,13 @@ import re +import packaging.version import pandas as pd +import pyarrow as pa import pytest -import bigframes.series +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd from ...utils import assert_series_equal @@ -25,7 +28,7 @@ def test_find(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.find("W").to_pandas() pd_result = scalars_pandas_df[col_name].str.find("W") @@ -50,7 +53,7 @@ def test_find(scalars_dfs): def test_str_contains(scalars_dfs, pat, case, flags, regex): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.contains( pat, case=case, flags=flags, regex=regex @@ -72,7 +75,7 @@ def test_str_contains(scalars_dfs, pat, case, flags, regex): def test_str_extract(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.extract(pat).to_pandas() pd_result = scalars_pandas_df[col_name].str.extract(pat) @@ -101,7 +104,7 @@ def test_str_extract(scalars_dfs, pat): def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.replace( pat, repl=repl, case=case, flags=flags, regex=regex @@ -132,7 +135,7 @@ def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): def test_str_startswith(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name].astype("object") bf_result = bf_series.str.startswith(pat).to_pandas() @@ -157,7 +160,7 @@ def test_str_startswith(scalars_dfs, pat): def test_str_endswith(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name].astype("object") bf_result = bf_series.str.endswith(pat).to_pandas() @@ -169,7 +172,7 @@ def test_str_endswith(scalars_dfs, pat): def test_len(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.len().to_pandas() pd_result = scalars_pandas_df[col_name].str.len() @@ -188,7 +191,7 @@ def test_len_with_array_column(nested_df, nested_pandas_df): See: https://stackoverflow.com/a/41340543/101923 """ col_name = "event_sequence" - 
bf_series: bigframes.series.Series = nested_df[col_name] + bf_series: bpd.Series = nested_df[col_name] bf_result = bf_series.str.len().to_pandas() pd_result = nested_pandas_df[col_name].str.len() @@ -204,7 +207,7 @@ def test_len_with_array_column(nested_df, nested_pandas_df): def test_lower(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() @@ -217,7 +220,7 @@ def test_lower(scalars_dfs): def test_reverse(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.reverse().to_pandas() pd_result = scalars_pandas_df[col_name].copy() for i in pd_result.index: @@ -239,7 +242,7 @@ def test_reverse(scalars_dfs): def test_slice(scalars_dfs, start, stop): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.slice(start, stop).to_pandas() pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) @@ -253,7 +256,7 @@ def test_slice(scalars_dfs, start, stop): def test_strip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() @@ -266,7 +269,7 @@ def test_strip(scalars_dfs): def test_upper(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() @@ -375,7 +378,7 @@ def test_isupper(weird_strings, weird_strings_pd): def test_rstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() @@ -388,7 +391,7 @@ def test_rstrip(scalars_dfs): def test_lstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() @@ -402,7 +405,7 @@ def test_lstrip(scalars_dfs): def test_repeat(scalars_dfs, repeats): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) @@ -415,7 +418,7 @@ def test_repeat(scalars_dfs, repeats): def test_capitalize(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.capitalize().to_pandas() pd_result = 
scalars_pandas_df[col_name].str.capitalize() @@ -428,9 +431,9 @@ def test_capitalize(scalars_dfs): def test_cat_with_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_filter: bigframes.series.Series = scalars_df["bool_col"] - bf_left: bigframes.series.Series = scalars_df[col_name][bf_filter] - bf_right: bigframes.series.Series = scalars_df[col_name] + bf_filter: bpd.Series = scalars_df["bool_col"] + bf_left: bpd.Series = scalars_df[col_name][bf_filter] + bf_right: bpd.Series = scalars_df[col_name] bf_result = bf_left.str.cat(others=bf_right).to_pandas() pd_filter = scalars_pandas_df["bool_col"] pd_left = scalars_pandas_df[col_name][pd_filter] @@ -447,7 +450,7 @@ def test_str_match(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" pattern = "[A-Z].*" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.match(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.match(pattern) @@ -461,7 +464,7 @@ def test_str_fullmatch(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" pattern = "[A-Z].*!" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.fullmatch(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern) @@ -474,7 +477,7 @@ def test_str_fullmatch(scalars_dfs): def test_str_get(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) @@ -487,7 +490,7 @@ def test_str_get(scalars_dfs): def test_str_pad(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%") @@ -510,7 +513,7 @@ def test_str_zfill(weird_strings, weird_strings_pd): def test_str_ljust(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%") @@ -523,7 +526,7 @@ def test_str_ljust(scalars_dfs): def test_str_rjust(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%") @@ -562,3 +565,103 @@ def test_str_split_raise_errors(scalars_dfs, pat, regex): pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) assert_series_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param( + "first", id="invalid_type", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param( + -1, id="neg_index", marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + slice(0, 2, 2), + id="only_allow_one_step", + 
marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + slice(-1, None, None), + id="neg_slicing", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_getitem_raise_errors(scalars_dfs, index): + scalars_df, _ = scalars_dfs + col_name = "string_col" + scalars_df[col_name].str[index] + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param(2, id="int"), + pytest.param(slice(None, None, None), id="default_start_slice"), + pytest.param(slice(0, None, 1), id="default_stop_slice"), + pytest.param(slice(0, 2, None), id="default_step_slice"), + ], +) +def test_getitem_w_string(scalars_dfs, index): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].str[index].to_pandas() + pd_result = scalars_pandas_df[col_name].str[index] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param(2, id="int"), + pytest.param(slice(None, None, None), id="default_start_slice"), + pytest.param(slice(0, None, 1), id="default_stop_slice"), + pytest.param(slice(0, 2, None), id="default_step_slice"), + pytest.param(slice(0, 0, None), id="single_one_slice"), + ], +) +def test_getitem_w_array(index): + data = [[1], [2, 3], [], [4, 5, 6]] + s = bpd.Series(data) + pd_s = pd.Series(data) + + bf_result = s.str[index].to_pandas() + pd_result = pd_s.str[index] + # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +def test_getitem_w_struct_array(): + if packaging.version.Version(pd.__version__) <= packaging.version.Version("1.5.0"): + pytest.skip("https://github.com/googleapis/python-bigquery/issues/1992") + + pa_struct = pa.struct( + [ + ("name", pa.string()), + ("age", pa.int64()), + ] + ) + data: list[list[dict]] = [ + [ + {"name": "Alice", "age": 30}, + {"name": "Bob", "age": 25}, + ], + [ + {"name": "Charlie", "age": 35}, + {"name": "David", "age": 40}, + {"name": "Eva", "age": 28}, + ], + [], + [{"name": "Frank", "age": 50}], + ] + s = bpd.Series(data, dtype=bpd.ArrowDtype(pa.list_(pa_struct))) + + result = s.str[1] + assert dtypes.is_struct_like(result.dtype) + + expected_data = [item[1] if len(item) > 1 else None for item in data] + expected = bpd.Series(expected_data, dtype=bpd.ArrowDtype((pa_struct))) + + assert_series_equal(result.to_pandas(), expected.to_pandas()) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index b02c23f945..bd5e78f415 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -13,6 +13,37 @@ class StringMethods: R's stringr package. """ + def __getitem__(self, key: typing.Union[int, slice]): + """ + Index or slice string or list in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) + >>> s.str[0] + 0 A + 1 B + 2 C + dtype: string + + >>> s.str[0:3] + 0 Ali + 1 Bob + 2 Cha + dtype: string + + Args: + key (int | slice): + Index or slice of indices to access from each string or list. + + Returns: + bigframes.series.Series: The list at requested index. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def extract(self, pat: str, flags: int = 0): """ Extract capture groups in the regex `pat` as columns in a DataFrame. From ae07274ea3b49f0350da77c3f8fdb44e4cda6778 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:21:06 -0700 Subject: [PATCH 15/15] chore(main): release 1.14.0 (#882) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3209391f44..754658c5e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.14.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.13.0...v1.14.0) (2024-08-14) + + +### Features + +* Implement `bigframes.bigquery.json_extract` ([#868](https://github.com/googleapis/python-bigquery-dataframes/issues/868)) ([3dbf84b](https://github.com/googleapis/python-bigquery-dataframes/commit/3dbf84bd1531c1f8d41ba57c2c38b3ba6abfb812)) +* Implement `Series.str.__getitem__` ([#897](https://github.com/googleapis/python-bigquery-dataframes/issues/897)) ([e027b7e](https://github.com/googleapis/python-bigquery-dataframes/commit/e027b7e9d29f628d058611106014a1790459958c)) + + +### Bug Fixes + +* Fix caching from generating row numbers in partial ordering mode ([#872](https://github.com/googleapis/python-bigquery-dataframes/issues/872)) ([52b7786](https://github.com/googleapis/python-bigquery-dataframes/commit/52b7786c3a28da6c29e3ddf12629802215194ad9)) + + +### Performance Improvements + +* Generate SQL with fewer CTEs ([#877](https://github.com/googleapis/python-bigquery-dataframes/issues/877)) ([eb60804](https://github.com/googleapis/python-bigquery-dataframes/commit/eb6080460344aff2fabb7864536ea4fe24c5fbef)) +* Speed up compilation by reducing redundant type normalization ([#896](https://github.com/googleapis/python-bigquery-dataframes/issues/896)) ([e0b11bc](https://github.com/googleapis/python-bigquery-dataframes/commit/e0b11bc8c038db7b950b1653ed4cd44a6246c713)) + + +### Documentation + +* Add streaming html docs ([#884](https://github.com/googleapis/python-bigquery-dataframes/issues/884)) ([171da6c](https://github.com/googleapis/python-bigquery-dataframes/commit/171da6cb33165b49d46ea6528038342abd89e9fa)) +* Fix the `DisplayOptions` doc rendering ([#893](https://github.com/googleapis/python-bigquery-dataframes/issues/893)) ([3eb6a17](https://github.com/googleapis/python-bigquery-dataframes/commit/3eb6a17a5823faf5ecba92cb9a554df74477871d)) +* Update streaming notebook ([#887](https://github.com/googleapis/python-bigquery-dataframes/issues/887)) ([6e6f9df](https://github.com/googleapis/python-bigquery-dataframes/commit/6e6f9df55d435afe0b3ade728ca06826e92a6ee6)) + ## [1.13.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.12.0...v1.13.0) (2024-08-05) diff --git a/bigframes/version.py b/bigframes/version.py index b474f021d4..2e135689ed 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.13.0" +__version__ = "1.14.0"
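
A minimal usage sketch of the `Series.str.__getitem__` support released in 1.14.0, mirroring the accessor docstring and the new system tests above. It is illustrative only and assumes an authenticated BigQuery DataFrames session with a default billing project configured; the variable names are not part of the patch.

```python
import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

# String column: integer indexing and slicing are compiled through the new
# ArrayIndexOp / ArraySliceOp scalar operators.
names = bpd.Series(["Alice", "Bob", "Charlie"])
first_chars = names.str[0]   # "A", "B", "C"
prefixes = names.str[0:3]    # "Ali", "Bob", "Cha"

# List (ARRAY) column: the same accessor indexes into each array, yielding
# <NA> where the requested element does not exist (e.g. the empty list).
events = bpd.Series([[1], [2, 3], [], [4, 5, 6]])
second_items = events.str[1]  # <NA>, 3, <NA>, 5

print(first_chars.to_pandas())
print(prefixes.to_pandas())
print(second_items.to_pandas())

# Negative indices, negative slice bounds, and steps other than 1 raise
# NotImplementedError, as covered by test_getitem_raise_errors.
```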