From 443db228375da9b232376140c9d5b0db14895eae Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 7 Feb 2024 13:18:15 -0800 Subject: [PATCH 01/10] feat: add `Series.cov` method (#368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # ๐Ÿฆ• --- bigframes/core/__init__.py | 22 ------------------- bigframes/core/blocks.py | 19 +++++++++------- bigframes/core/compile/aggregate_compiler.py | 13 +++++++++++ bigframes/operations/aggregations.py | 5 +++++ bigframes/operations/base.py | 7 ++++-- bigframes/series.py | 9 +++++--- tests/system/small/test_series.py | 13 ++++++++++- tests/unit/test_core.py | 20 ----------------- .../bigframes_vendored/pandas/core/series.py | 21 ++++++++++++++++++ 9 files changed, 73 insertions(+), 56 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index ea169dbb74..4dc2e4d7af 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -261,28 +261,6 @@ def aggregate( ) ) - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> ArrayValue: - """ - Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. - This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. - Arguments: - corr_aggregations: left_column_id, right_column_id, output_column_id tuples - """ - aggregations = tuple( - ( - ex.BinaryAggregation( - agg_ops.CorrOp(), ex.free_var(agg[0]), ex.free_var(agg[1]) - ), - agg[2], - ) - for agg in corr_aggregations - ) - return ArrayValue( - nodes.AggregateNode(child=self.node, aggregations=aggregations) - ) - def project_window_op( self, column_name: str, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e758e20335..34df7231cc 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1040,26 +1040,29 @@ def get_stat(self, column_id: str, stat: agg_ops.UnaryAggregateOp): self._stats_cache[column_id].update(stats_map) return stats_map[stat.name] - def get_corr_stat(self, column_id_left: str, column_id_right: str): + def get_binary_stat( + self, column_id_left: str, column_id_right: str, stat: agg_ops.BinaryAggregateOp + ): # TODO(kemppeterson): Clean up the column names for DataFrames.corr support # TODO(kemppeterson): Add a cache here. - corr_aggregations = [ + aggregations = [ ( - column_id_left, - column_id_right, - "corr_" + column_id_left + column_id_right, + ex.BinaryAggregation( + stat, ex.free_var(column_id_left), ex.free_var(column_id_right) + ), + f"{stat.name}_{column_id_left}{column_id_right}", ) ] - expr = self.expr.corr_aggregate(corr_aggregations) + expr = self.expr.aggregate(aggregations) offset_index_id = guid.generate_guid() expr = expr.promote_offsets(offset_index_id) block = Block( expr, index_columns=[offset_index_id], - column_labels=[a[2] for a in corr_aggregations], + column_labels=[a[1] for a in aggregations], ) df, _ = block.to_pandas() - return df.loc[0, "corr_" + column_id_left + column_id_right] + return df.loc[0, f"{stat.name}_{column_id_left}{column_id_right}"] def summarize( self, diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 0dbc0e7310..1dad128599 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -431,6 +431,19 @@ def _( return cast(ibis_types.NumericColumn, bq_corr) +@compile_binary_agg.register +def _( + op: agg_ops.CovOp, left: ibis_types.Column, right: ibis_types.Column, window=None +) -> ibis_types.NumericValue: + # Will be null if all inputs are null. Pandas defaults to zero sum though. + left_numeric = cast(ibis_types.NumericColumn, left) + right_numeric = cast(ibis_types.NumericColumn, right) + bq_cov = _apply_window_if_present( + left_numeric.cov(right_numeric, how="sample"), window + ) + return cast(ibis_types.NumericColumn, bq_cov) + + def _apply_window_if_present(value: ibis_types.Value, window): return value.over(window) if (window is not None) else value diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index aed05e287b..6301ece865 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -265,6 +265,11 @@ class CorrOp(BinaryAggregateOp): name: ClassVar[str] = "corr" +@dataclasses.dataclass(frozen=True) +class CovOp(BinaryAggregateOp): + name: ClassVar[str] = "cov" + + sum_op = SumOp() mean_op = MeanOp() median_op = MedianOp() diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 4aad9479e7..04114b43cb 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -25,6 +25,7 @@ import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops import bigframes.series as series import bigframes.session import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -188,10 +189,12 @@ def _apply_binary_op( block, result_id = self._block.project_expr(expr, name) return series.Series(block.select_column(result_id)) - def _apply_corr_aggregation(self, other: series.Series) -> float: + def _apply_binary_aggregation( + self, other: series.Series, stat: agg_ops.BinaryAggregateOp + ) -> float: (left, right, block) = self._align(other, how="outer") - return block.get_corr_stat(left, right) + return block.get_binary_stat(left, right, stat) def _align(self, other: series.Series, how="outer") -> tuple[str, str, blocks.Block]: # type: ignore """Aligns the series value with another scalar or series object. Returns new left column id, right column id and joined tabled expression.""" diff --git a/bigframes/series.py b/bigframes/series.py index 6167ce0966..753e195e0a 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -734,8 +734,8 @@ def round(self, decimals=0) -> "Series": return self._apply_binary_op(decimals, ops.round_op) def corr(self, other: Series, method="pearson", min_periods=None) -> float: - # TODO(kemppeterson): Validate early that both are numeric - # TODO(kemppeterson): Handle partially-numeric columns + # TODO(tbergeron): Validate early that both are numeric + # TODO(tbergeron): Handle partially-numeric columns if method != "pearson": raise NotImplementedError( f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}" @@ -744,7 +744,10 @@ def corr(self, other: Series, method="pearson", min_periods=None) -> float: raise NotImplementedError( f"min_periods not yet supported. {constants.FEEDBACK_LINK}" ) - return self._apply_corr_aggregation(other) + return self._apply_binary_aggregation(other, agg_ops.CorrOp()) + + def cov(self, other: Series) -> float: + return self._apply_binary_aggregation(other, agg_ops.CovOp()) def all(self) -> bool: return typing.cast(bool, self._apply_aggregation(agg_ops.all_op)) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5d8fb0b29c..f2790d190a 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -656,7 +656,7 @@ def test_mods(scalars_dfs, col_x, col_y, method): # We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this # manually with dumb self-correlation instead of parameterized as test_mods is above. -def test_corr(scalars_dfs): +def test_series_corr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) pd_result = ( @@ -667,6 +667,17 @@ def test_corr(scalars_dfs): assert math.isclose(pd_result, bf_result) +def test_series_cov(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .cov(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + @pytest.mark.parametrize( ("col_x",), [ diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index a7f45efc85..42cbcbbc9f 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -208,23 +208,3 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" assert expr.columns[1].type().is_int64() - - -def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.corr_aggregate( - corr_aggregations=[("col1", "col3", "col4")] - )._compile_ordered() - actual = expr._to_ibis_expr(ordering_mode="unordered") - assert len(expr.columns) == 1 - assert actual.columns[0] == "col4" - assert expr.columns[0].type().is_float64() diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 9e376c713e..4232d3ec2a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -842,6 +842,27 @@ def corr(self, other, method="pearson", min_periods=None) -> float: """ raise NotImplementedError("abstract method") + def cov( + self, + other, + ) -> float: + """ + Compute covariance with Series, excluding missing values. + + The two `Series` objects are not required to be the same length and + will be aligned internally before the covariance is calculated. + + Args: + other (Series): + Series with which to compute the covariance. + + Returns: + float: + Covariance between Series and other normalized by N-1 + (unbiased estimator). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def diff(self) -> Series: """ First discrete difference of element. From d5518b28509be0ce070b22d9134a6a662412010a Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:10:15 -0800 Subject: [PATCH 02/10] feat: add XGBoostModel (#363) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #321809936 ๐Ÿฆ• --- README.rst | 2 + bigframes/ml/core.py | 30 +++++++ bigframes/ml/globals.py | 11 +++ bigframes/ml/imported.py | 117 ++++++++++++++++++++++++- bigframes/ml/loader.py | 2 + bigframes/ml/remote.py | 12 +-- bigframes/ml/sql.py | 18 ++++ docs/templates/toc.yml | 2 + tests/system/small/ml/conftest.py | 40 +++++++++ tests/system/small/ml/test_imported.py | 42 ++++++++- tests/unit/ml/test_sql.py | 24 +++++ 11 files changed, 287 insertions(+), 13 deletions(-) diff --git a/README.rst b/README.rst index 38f7d0ab7a..a977a9f31e 100644 --- a/README.rst +++ b/README.rst @@ -172,6 +172,8 @@ Create estimators for imported models by using the `bigframes.ml.imported module to import Open Neural Network Exchange (ONNX) models. * Use the `TensorFlowModel class `_ to import TensorFlow models. +* Use the `XGBoostModel class `_ + to import XGBoostModel models. **Linear models** diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 266ab1b058..51fed05901 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -355,3 +355,33 @@ def create_imported_model( ) return self._create_model_with_sql(session=session, sql=sql) + + def create_xgboost_imported_model( + self, + session: bigframes.Session, + input: Mapping[str, str] = {}, + output: Mapping[str, str] = {}, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML imported model with the CREATE OR REPLACE MODEL statement + + Args: + input: + input schema for imported xgboost models + output: + output schema for imported xgboost models + options: a dict of options to configure the model. Generates a BQML OPTIONS + clause + + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + model_ref = self._create_model_ref(session._anonymous_dataset) + + sql = self._model_creation_sql_generator.create_xgboost_imported_model( + model_ref=model_ref, + input=input, + output=output, + options=options, + ) + + return self._create_model_with_sql(session=session, sql=sql) diff --git a/bigframes/ml/globals.py b/bigframes/ml/globals.py index c139476daa..44e9463727 100644 --- a/bigframes/ml/globals.py +++ b/bigframes/ml/globals.py @@ -19,6 +19,17 @@ _BASE_SQL_GENERATOR = sql.BaseSqlGenerator() _BQML_MODEL_FACTORY = core.BqmlModelFactory() +_SUPPORTED_DTYPES = ( + "bool", + "string", + "int64", + "float64", + "array", + "array", + "array", + "array", +) + def base_sql_generator() -> sql.BaseSqlGenerator: """Base SQL Generator.""" diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 4ae0a8ea4d..e2be154703 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -16,13 +16,14 @@ from __future__ import annotations -from typing import cast, Optional, Union +from typing import cast, Mapping, Optional, Union from google.cloud import bigquery import bigframes from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils +from bigframes.ml.globals import _SUPPORTED_DTYPES import bigframes.pandas as bpd @@ -176,3 +177,117 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) + + +@log_adapter.class_logger +class XGBoostModel(base.Predictor): + """Imported XGBoost model. + + .. warning:: + + Imported XGBoost models have the several limitations. See: + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations + + Args: + session (BigQuery Session): + BQ session to create the model + input (Dict, default None): + Specify the model input schema information when you + create the XGBoost model. The input should be the format of + {field_name: field_type}. Input is optional only if feature_names + and feature_types are both specified in the model file. Supported types + are "bool", "string", "int64", "float64", "array", "array", "array", "array". + output (Dict, default None): + Specify the model output schema information when you + create the XGBoost model. The input should be the format of + {field_name: field_type}. Output is optional only if feature_names + and feature_types are both specified in the model file. Supported types + are "bool", "string", "int64", "float64", "array", "array", "array", "array". + model_path (str): + Cloud Storage path that holds the model files.""" + + def __init__( + self, + session: Optional[bigframes.Session] = None, + input: Mapping[str, str] = {}, + output: Mapping[str, str] = {}, + model_path: Optional[str] = None, + ): + self.session = session or bpd.get_global_session() + self.model_path = model_path + self.input = input + self.output = output + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + + def _create_bqml_model(self): + options = {"model_type": "XGBOOST", "model_path": self.model_path} + + if not self.input and not self.output: + return self._bqml_model_factory.create_imported_model( + session=self.session, options=options + ) + else: + for io in (self.input, self.output): + for v in io.values(): + if v not in _SUPPORTED_DTYPES: + raise ValueError( + f"field_type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}." + ) + + return self._bqml_model_factory.create_xgboost_imported_model( + session=self.session, + input=self.input, + output=self.output, + options=options, + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> XGBoostModel: + assert model.model_type == "XGBOOST" + + xgboost_model = cls(session=session, model_path=None) + xgboost_model._bqml_model = core.BqmlModel(session, model) + return xgboost_model + + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, schema is defined by the model. + + Returns: + bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.predict(X) + + def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + XGBoostModel: saved model.""" + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 4ffde43543..99a31922d8 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -45,6 +45,7 @@ "RANDOM_FOREST_CLASSIFIER": ensemble.RandomForestClassifier, "TENSORFLOW": imported.TensorFlowModel, "ONNX": imported.ONNXModel, + "XGBOOST": imported.XGBoostModel, } ) @@ -72,6 +73,7 @@ def from_bq( ensemble.RandomForestClassifier, imported.TensorFlowModel, imported.ONNXModel, + imported.XGBoostModel, llm.PaLM2TextGenerator, llm.PaLM2TextEmbeddingGenerator, pipeline.Pipeline, diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index 8da073802d..44fde4f32f 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -23,19 +23,9 @@ from bigframes import clients from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils +from bigframes.ml.globals import _SUPPORTED_DTYPES import bigframes.pandas as bpd -_SUPPORTED_DTYPES = ( - "bool", - "string", - "int64", - "float64", - "array", - "array", - "array", - "array", -) - _REMOTE_MODEL_STATUS = "remote_model_status" diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 152f881ec0..f726317bb4 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -193,6 +193,24 @@ def create_imported_model( parts.append(self.options(**options)) return "\n".join(parts) + def create_xgboost_imported_model( + self, + model_ref: google.cloud.bigquery.ModelReference, + input: Mapping[str, str] = {}, + output: Mapping[str, str] = {}, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> str: + """Encode the CREATE OR REPLACE MODEL statement for BQML remote model.""" + + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] + if input: + parts.append(self.input(**input)) + if output: + parts.append(self.output(**output)) + if options: + parts.append(self.options(**options)) + return "\n".join(parts) + class ModelManipulationSqlGenerator(BaseSqlGenerator): """Sql generator for manipulating a model entity. Model name is the full model path of project_id.dataset_id.model_id.""" diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b680a5fc1a..d387a513a1 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -91,6 +91,8 @@ uid: bigframes.ml.imported.ONNXModel - name: TensorFlowModel uid: bigframes.ml.imported.TensorFlowModel + - name: XGBoostModel + uid: bigframes.ml.imported.XGBoostModel name: imported - items: - name: Overview diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index e3180d2892..422ea6f1f4 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -190,11 +190,29 @@ def onnx_iris_pandas_df(): ) +@pytest.fixture(scope="session") +def xgboost_iris_pandas_df(): + """Data matching the iris dataset.""" + return pd.DataFrame( + { + "sepal_length": [4.9, 5.1, 34.7], + "sepal_width": [3.0, 5.1, 24.7], + "petal_length": [1.4, 1.5, 13.3], + "petal_width": [0.4, 0.2, 18.3], + } + ) + + @pytest.fixture(scope="session") def onnx_iris_df(session, onnx_iris_pandas_df): return session.read_pandas(onnx_iris_pandas_df) +@pytest.fixture(scope="session") +def xgboost_iris_df(session, xgboost_iris_pandas_df): + return session.read_pandas(xgboost_iris_pandas_df) + + @pytest.fixture(scope="session") def llm_text_df(session, llm_text_pandas_df): return session.read_pandas(llm_text_pandas_df) @@ -322,6 +340,11 @@ def imported_onnx_model_path() -> str: return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx" +@pytest.fixture(scope="session") +def imported_xgboost_array_model_path() -> str: + return "gs://bigframes-dev-testing/xgboost-testdata/model.bst" + + @pytest.fixture(scope="session") def imported_tensorflow_model( session, imported_tensorflow_model_path @@ -346,3 +369,20 @@ def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel session=session, model_path=imported_onnx_model_path, ) + + +@pytest.fixture(scope="session") +def imported_xgboost_model( + session, imported_xgboost_array_model_path +) -> imported.XGBoostModel: + return imported.XGBoostModel( + session=session, + input={ + "petal_length": "float64", + "petal_width": "float64", + "sepal_length": "float64", + "sepal_width": "float64", + }, + output={"predicted_label": "float64"}, + model_path=imported_xgboost_array_model_path, + ) diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 8ffd9924e9..2b8d04c3ae 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -70,7 +70,7 @@ def test_onnx_create_model(imported_onnx_model): def test_onnx_create_model_default_session(imported_onnx_model_path): - model = imported.TensorFlowModel(model_path=imported_onnx_model_path) + model = imported.ONNXModel(model_path=imported_onnx_model_path) assert model is not None @@ -100,3 +100,43 @@ def test_onnx_model_to_gbq(imported_onnx_model: imported.ONNXModel, dataset_id: imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model", replace=True) with pytest.raises(google.api_core.exceptions.Conflict): imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model") + + +def test_xgboost_create_model(imported_xgboost_model): + # Model creation doesn't return error + assert imported_xgboost_model is not None + + +def test_xgboost_create_model_default_session(imported_xgboost_array_model_path): + model = imported.XGBoostModel(model_path=imported_xgboost_array_model_path) + assert model is not None + + +def test_xgboost_model_predict(imported_xgboost_model, xgboost_iris_df): + predictions = imported_xgboost_model.predict(xgboost_iris_df).to_pandas() + assert predictions.shape == (3, 5) + result = predictions[["predicted_label"]] + value1 = np.array([0.00362173, 0.01580198, 0.98057634]) + value2 = np.array([0.00349651, 0.00999565, 0.98650789]) + value3 = np.array([0.00561748, 0.0108124, 0.98357016]) + expected = pd.DataFrame( + { + "predicted_label": [value1, value2, value3], + }, + index=pd.Index([0, 1, 2], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + check_dtype=False, + atol=0.1, + ) + + +def test_xgboost_model_to_gbq( + imported_xgboost_model: imported.XGBoostModel, dataset_id: str +): + imported_xgboost_model.to_gbq(f"{dataset_id}.test_xgboost_model", replace=True) + with pytest.raises(google.api_core.exceptions.Conflict): + imported_xgboost_model.to_gbq(f"{dataset_id}.test_xgboost_model") diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 37cc33d33e..de80dad710 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -234,6 +234,30 @@ def test_create_imported_model_produces_correct_sql( ) +def test_create_xgboost_imported_model_produces_correct_sql( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_xgboost_imported_model( + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_xgboost_imported_model" + ), + input={"column1": "int64"}, + output={"result": "array"}, + options={"option_key1": "option_value1", "option_key2": 2}, + ) + assert ( + sql + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_xgboost_imported_model` +INPUT( + column1 int64) +OUTPUT( + result array) +OPTIONS( + option_key1="option_value1", + option_key2=2)""" + ) + + def test_alter_model_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): From 930cf6b9ae8a48f422586dbd21b52e15c9ef9492 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 7 Feb 2024 18:10:18 -0800 Subject: [PATCH 03/10] fix: error message fix. (#375) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # ๐Ÿฆ• --- bigframes/dataframe.py | 4 ++-- bigframes/functions/remote_function.py | 2 +- bigframes/series.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2729d23701..9db567a497 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1532,7 +1532,7 @@ def _filter_rows( ) -> DataFrame: if len(self._block.index_columns) > 1: raise NotImplementedError( - "Method filter does not support rows multiindex. {constants.FEEDBACK_LINK}" + f"Method filter does not support rows multiindex. {constants.FEEDBACK_LINK}" ) if (like is not None) or (regex is not None): block = self._block @@ -1760,7 +1760,7 @@ def dropna( ) -> DataFrame: if inplace: raise NotImplementedError( - "'inplace'=True not supported. {constants.FEEDBACK_LINK}" + f"'inplace'=True not supported. {constants.FEEDBACK_LINK}" ) if how not in ("any", "all"): raise ValueError("'how' must be one of 'any', 'all'") diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index dfffbe65ac..c7bb5d92c6 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -880,7 +880,7 @@ def read_gbq_function( ibis_signature = ibis_signature_from_routine(routine) except ReturnTypeMissingError: raise ValueError( - "Function return type must be specified. {constants.FEEDBACK_LINK}" + f"Function return type must be specified. {constants.FEEDBACK_LINK}" ) except bigframes.dtypes.UnsupportedTypeError as e: raise ValueError( diff --git a/bigframes/series.py b/bigframes/series.py index 753e195e0a..74580989f3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1242,7 +1242,7 @@ def filter( ) if len(self._block.index_columns) > 1: raise NotImplementedError( - "Method filter does not support rows multiindex. {constants.FEEDBACK_LINK}" + f"Method filter does not support rows multiindex. {constants.FEEDBACK_LINK}" ) if (like is not None) or (regex is not None): block = self._block From 99a9e6e15c6eef4297035ce89bb619f8e4ca54ff Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 8 Feb 2024 21:24:16 +0000 Subject: [PATCH 04/10] docs: clarify ADC pre-auth in a non-interactive environment (#348) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 321859738 ๐Ÿฆ• --- README.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index a977a9f31e..08322f51f2 100644 --- a/README.rst +++ b/README.rst @@ -27,10 +27,13 @@ Prerequisites * Install the ``bigframes`` package. * Create a Google Cloud project and billing account. -* When running locally, authenticate with application default credentials. See - the `gcloud auth application-default login - `_ - reference. +* In an interactive environment (like Notebook, Python REPL or command line), + ``bigframes`` will do the authentication on-the-fly if needed. Otherwise, see + `how to set up application default credentials `_ + for various environments. For example, to pre-authenticate on your laptop you can + `install and initialize the gcloud CLI `_, + and then generate the application default credentials by doing + `gcloud auth application-default login `_. Code sample ^^^^^^^^^^^ @@ -286,9 +289,6 @@ created by BigQuery DataFrames are prefixed by ``bigframes``. **Requirements** -BigQuery DataFrames uses the ``gcloud`` command-line interface internally, -so you must run ``gcloud auth login`` before using remote functions. - To use BigQuery DataFrames remote functions, you must enable the following APIs: * The BigQuery API (bigquery.googleapis.com) From de1e0a451785e679f37b083be6d58c267319f56a Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 9 Feb 2024 11:56:18 -0800 Subject: [PATCH 05/10] feat: add ml.llm.GeminiTextGenerator model (#370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # ๐Ÿฆ• --- README.rst | 2 + bigframes/ml/llm.py | 174 ++++++++++++++++++++++++++++++ bigframes/ml/loader.py | 1 + docs/templates/toc.yml | 2 + tests/system/small/ml/conftest.py | 5 + tests/system/small/ml/test_llm.py | 39 +++++++ 6 files changed, 223 insertions(+) diff --git a/README.rst b/README.rst index 08322f51f2..1323a065d8 100644 --- a/README.rst +++ b/README.rst @@ -194,6 +194,8 @@ Create estimators for linear models by using the `bigframes.ml.linear_model modu Create estimators for LLMs by using the `bigframes.ml.llm module `_. +* Use the `GeminiTextGenerator class `_ to create Gemini text generator models. Use these models + for text generation tasks. * Use the `PaLM2TextGenerator class `_ to create PaLM2 text generator models. Use these models for text generation tasks. * Use the `PaLM2TextEmbeddingGenerator class `_ to create PaLM2 text embedding generator models. diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 3607000323..f44310f6a5 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -41,6 +41,8 @@ _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, ) +_GEMINI_PRO_ENDPOINT = "gemini-pro" + _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" _ML_EMBED_TEXT_STATUS = "ml_embed_text_status" @@ -396,3 +398,175 @@ def to_gbq( new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) + + +@log_adapter.class_logger +class GeminiTextGenerator(base.Predictor): + """Gemini text generator LLM model. + + Args: + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + if None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach + permission if the connection isn't fully setup. + """ + + def __init__( + self, + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + self.session = session or bpd.get_global_session() + self._bq_connection_manager = clients.BqConnectionManager( + self.session.bqconnectionclient, self.session.resourcemanagerclient + ) + + connection_name = connection_name or self.session._bq_connection + self.connection_name = self._bq_connection_manager.resolve_full_connection_name( + connection_name, + default_project=self.session._project, + default_location=self.session._location, + ) + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + if not self.connection_name: + raise ValueError( + "Must provide connection_name, either in constructor or through session options." + ) + connection_name_parts = self.connection_name.split(".") + if len(connection_name_parts) != 3: + raise ValueError( + f"connection_name must be of the format .., got {self.connection_name}." + ) + self._bq_connection_manager.create_bq_connection( + project_id=connection_name_parts[0], + location=connection_name_parts[1], + connection_id=connection_name_parts[2], + iam_role="aiplatform.user", + ) + + options = {"endpoint": _GEMINI_PRO_ENDPOINT} + + return self._bqml_model_factory.create_remote_model( + session=self.session, connection_name=self.connection_name, options=options + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> GeminiTextGenerator: + assert model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in model._properties + assert "connection" in model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + model_connection = model._properties["remoteModelInfo"]["connection"] + + text_generator_model = cls(session=session, connection_name=model_connection) + text_generator_model._bqml_model = core.BqmlModel(session, model) + return text_generator_model + + def predict( + self, + X: Union[bpd.DataFrame, bpd.Series], + temperature: float = 0.9, + max_output_tokens: int = 8192, + top_k: int = 40, + top_p: float = 1.0, + ) -> bpd.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, which contains only one column of prompts. + Prompts can include preamble, questions, suggestions, instructions, or examples. + + temperature (float, default 0.9): + The temperature is used for sampling during the response generation, which occurs when topP and topK are applied. Temperature controls the degree of randomness in token selection. Lower temperatures are good for prompts that require a more deterministic and less open-ended or creative response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic: the highest probability response is always selected. + Default 0.9. Possible values [0.0, 1.0]. + + max_output_tokens (int, default 8192): + Maximum number of tokens that can be generated in the response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. + Specify a lower value for shorter responses and a higher value for potentially longer responses. + Default 8192. Possible values are in the range [1, 8192]. + + top_k (int, default 40): + Top-K changes how the model selects tokens for output. A top-K of 1 means the next selected token is the most probable among all tokens in the model's vocabulary (also called greedy decoding), while a top-K of 3 means that the next token is selected from among the three most probable tokens by using temperature. + For each token selection step, the top-K tokens with the highest probabilities are sampled. Then tokens are further filtered based on top-P with the final token selected using temperature sampling. + Specify a lower value for less random responses and a higher value for more random responses. + Default 40. Possible values [1, 40]. + + top_p (float, default 0.95):: + Top-P changes how the model selects tokens for output. Tokens are selected from the most (see top-K) to least probable until the sum of their probabilities equals the top-P value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-P value is 0.5, then the model will select either A or B as the next token by using temperature and excludes C as a candidate. + Specify a lower value for less random responses and a higher value for more random responses. + Default 1.0. Possible values [0.0, 1.0]. + + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + + # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models + if temperature < 0.0 or temperature > 1.0: + raise ValueError(f"temperature must be [0.0, 1.0], but is {temperature}.") + + if max_output_tokens not in range(1, 8193): + raise ValueError( + f"max_output_token must be [1, 8192] for Gemini model, but is {max_output_tokens}." + ) + + if top_k not in range(1, 41): + raise ValueError(f"top_k must be [1, 40], but is {top_k}.") + + if top_p < 0.0 or top_p > 1.0: + raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") + + (X,) = utils.convert_to_dataframe(X) + + if len(X.columns) != 1: + raise ValueError( + f"Only support one column as input. {constants.FEEDBACK_LINK}" + ) + + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "prompt"}) + + options = { + "temperature": temperature, + "max_output_tokens": max_output_tokens, + "top_k": top_k, + "top_p": top_p, + "flatten_json_output": True, + } + + df = self._bqml_model.generate_text(X, options) + + if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): + warnings.warn( + f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for detailed status. You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df + + def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + GeminiTextGenerator: saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 99a31922d8..dafda43e9d 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -55,6 +55,7 @@ llm._TEXT_GENERATOR_BISON_32K_ENDPOINT: llm.PaLM2TextGenerator, llm._EMBEDDING_GENERATOR_GECKO_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, llm._EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, + llm._GEMINI_PRO_ENDPOINT: llm.GeminiTextGenerator, } ) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index d387a513a1..e921279418 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -105,6 +105,8 @@ - items: - name: Overview uid: bigframes.ml.llm + - name: GeminiTextGenerator + uid: bigframes.ml.llm.GeminiTextGenerator - name: PaLM2TextGenerator uid: bigframes.ml.llm.PaLM2TextGenerator - name: PaLM2TextEmbeddingGenerator diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 422ea6f1f4..8bf08906f9 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -267,6 +267,11 @@ def palm2_embedding_generator_multilingual_model( ) +@pytest.fixture(scope="session") +def gemini_text_generator_model(session, bq_connection) -> llm.GeminiTextGenerator: + return llm.GeminiTextGenerator(session=session, connection_name=bq_connection) + + @pytest.fixture(scope="session") def linear_remote_model_params() -> dict: # Pre-deployed endpoint of linear reg model in Vertex. diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 805cee4fec..db959b854e 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -272,3 +272,42 @@ def test_embedding_generator_predict_series_success( series = df["text_embedding"] value = series[0] assert len(value) == 768 + + +def test_create_gemini_text_generator_model( + gemini_text_generator_model, dataset_id, bq_connection +): + # Model creation doesn't return error + assert gemini_text_generator_model is not None + assert gemini_text_generator_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = gemini_text_generator_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + + +@pytest.mark.flaky(retries=2, delay=120) +def test_gemini_text_generator_predict_default_params_success( + gemini_text_generator_model, llm_text_df +): + df = gemini_text_generator_model.predict(llm_text_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_gemini_text_generator_predict_with_params_success( + gemini_text_generator_model, llm_text_df +): + df = gemini_text_generator_model.predict( + llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 + ).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) From ffb0d15602fe4d86e7a1aad72bba0a7049193a14 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 12 Feb 2024 10:12:10 -0800 Subject: [PATCH 06/10] feat: Support bigframes.pandas.to_datetime for scalars, iterables and series. (#372) * feat: Support pd.to_datetime for scalars, iterables and series. * update test and docstring * update types * format update * remove import. * update docstring * update arg conversion * update examples * update format * update code examples, and working logic. * docstring update. * type update. * format update. * Update docstring format * remove import * remove empty line * Remove extra code * remove prints. * Code logic updates. * Add constants. * Update comments * Move datetime helpers to the end of file. * Update helper * update format * String process logic updated. * update import * remove print * update docstring * update docstring * update docstring * update note * update docstring * Update code examples --- bigframes/core/compile/scalar_op_compiler.py | 43 ++++++++++ bigframes/core/tools/__init__.py | 19 +++++ bigframes/core/tools/datetimes.py | 82 +++++++++++++++++++ bigframes/operations/__init__.py | 11 +++ bigframes/pandas/__init__.py | 28 +++++++ tests/system/small/test_pandas.py | 62 ++++++++++++++ .../pandas/core/tools/__init__.py | 0 .../pandas/core/tools/datetimes.py | 77 +++++++++++++++++ 8 files changed, 322 insertions(+) create mode 100644 bigframes/core/tools/__init__.py create mode 100644 bigframes/core/tools/datetimes.py create mode 100644 third_party/bigframes_vendored/pandas/core/tools/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/tools/datetimes.py diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index bf0755acc7..3bcdd70581 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -40,6 +40,17 @@ # ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. _FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) +# Datetime constants +UNIT_TO_US_CONVERSION_FACTORS = { + "D": 24 * 60 * 60 * 1000 * 1000, + "h": 60 * 60 * 1000 * 1000, + "m": 60 * 1000 * 1000, + "s": 1000 * 1000, + "ms": 1000, + "us": 1, + "ns": 1e-3, +} + class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -656,6 +667,33 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): return x.isin(matchable_ibis_values) +@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) +def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): + if x.type() == ibis_dtypes.str: + x = x.to_timestamp(op.format) if op.format else timestamp(x) + elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): + return x + elif x.type() != ibis_dtypes.timestamp: + # The default unit is set to "ns" (nanoseconds) for consistency + # with pandas, where "ns" is the default unit for datetime operations. + unit = op.unit or "ns" + if unit not in UNIT_TO_US_CONVERSION_FACTORS: + raise ValueError(f"Cannot convert input with unit '{unit}'.") + x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] + x_converted = x_converted.cast(ibis_dtypes.int64) + + # Note: Due to an issue where casting directly to a timestamp + # without a timezone does not work, we first cast to UTC. This + # approach appears to bypass a potential bug in Ibis's cast function, + # allowing for subsequent casting to a timestamp type without timezone + # information. Further investigation is needed to confirm this behavior. + x = x_converted.to_timestamp(unit="us").cast( + ibis_dtypes.Timestamp(timezone="UTC") + ) + + return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): if not hasattr(op.func, "bigframes_remote_function"): @@ -1141,3 +1179,8 @@ def is_null(value) -> bool: def _ibis_num(number: float): return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) + + +@ibis.udf.scalar.builtin +def timestamp(a: str) -> ibis_dtypes.timestamp: + """Convert string to timestamp.""" diff --git a/bigframes/core/tools/__init__.py b/bigframes/core/tools/__init__.py new file mode 100644 index 0000000000..38563510a7 --- /dev/null +++ b/bigframes/core/tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core.tools.datetimes import to_datetime + +__all__ = [ + "to_datetime", +] diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py new file mode 100644 index 0000000000..093fa0a670 --- /dev/null +++ b/bigframes/core/tools/datetimes.py @@ -0,0 +1,82 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Mapping +from datetime import datetime +from typing import Optional, Union + +import pandas as pd + +import bigframes.constants as constants +import bigframes.core.global_session as global_session +import bigframes.dataframe +import bigframes.operations as ops +import bigframes.series +import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes + + +def to_datetime( + arg: Union[ + vendored_pandas_datetimes.local_scalars, + vendored_pandas_datetimes.local_iterables, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ], + *, + utc: bool = False, + format: Optional[str] = None, + unit: Optional[str] = None, +) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: + if isinstance(arg, (int, float, str, datetime)): + return pd.to_datetime( + arg, + utc=utc, + format=format, + unit=unit, + ) + + if isinstance(arg, (Mapping, pd.DataFrame, bigframes.dataframe.DataFrame)): + raise NotImplementedError( + "Conversion of Mapping, pandas.DataFrame, or bigframes.dataframe.DataFrame " + f"to datetime is not implemented. {constants.FEEDBACK_LINK}" + ) + + if not isinstance(arg, bigframes.series.Series): + # This block ensures compatibility with local data formats, including + # iterables and pandas.Series + # TODO: Currently, data upload is performed using pandas DataFrames + # combined with the `read_pandas` method due to the BigFrames DataFrame + # constructor's limitations in handling various data types. Plan to update + # the upload process to utilize the BigFrames DataFrame constructor directly + # once it is enhanced for more related datatypes. + arg = global_session.with_default_session( + bigframes.session.Session.read_pandas, pd.DataFrame(arg) + ) + if len(arg.columns) != 1: + raise ValueError("Input must be 1-dimensional.") + + arg = arg[arg.columns[0]] + + if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore + raise NotImplementedError( + f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" + ) + + return arg._apply_unary_op( # type: ignore + ops.ToDatetimeOp( + utc=utc, + format=format, + unit=unit, + ) + ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index b40f42a3e8..37188e490e 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -404,6 +404,17 @@ def output_type(self, *input_types): return input_types[0] +@dataclasses.dataclass(frozen=True) +class ToDatetimeOp(UnaryOp): + name: typing.ClassVar[str] = "to_datetime" + utc: bool = False + format: typing.Optional[str] = None + unit: typing.Optional[str] = None + + def output_type(self, *input_types): + return input_types[0] + + # Binary Ops fillna_op = create_binary_op(name="fillna") cliplower_op = create_binary_op(name="clip_lower") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 554acda202..5320e84e21 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -17,6 +17,7 @@ from __future__ import annotations from collections import namedtuple +from datetime import datetime import inspect import sys import typing @@ -52,6 +53,7 @@ import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape +import bigframes.core.tools import bigframes.dataframe import bigframes.operations as ops import bigframes.series @@ -61,6 +63,7 @@ import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile +import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq @@ -635,6 +638,30 @@ def read_gbq_function(function_name: str): read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) + +def to_datetime( + arg: Union[ + vendored_pandas_datetimes.local_scalars, + vendored_pandas_datetimes.local_iterables, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ], + *, + utc: bool = False, + format: Optional[str] = None, + unit: Optional[str] = None, +) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: + return bigframes.core.tools.to_datetime( + arg, + utc=utc, + format=format, + unit=unit, + ) + + +to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ + + # pandas dtype attributes NA = pandas.NA BooleanDtype = pandas.BooleanDtype @@ -680,6 +707,7 @@ def read_gbq_function(function_name: str): "read_pandas", "read_pickle", "remote_function", + "to_datetime", # pandas dtype attributes "NA", "BooleanDtype", diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 0910c0b7e2..ec61329aa5 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datetime import datetime + import pandas as pd import pytest +import pytz import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal @@ -477,3 +480,62 @@ def test_qcut(scalars_dfs, q): pd_result = pd_result.astype("Int64") pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("arg", "utc", "unit", "format"), + [ + (173872738, False, None, None), + (32787983.23, True, "s", None), + ("2023-01-01", False, None, "%Y-%m-%d"), + (datetime(2023, 1, 1, 12, 0), False, None, None), + ], +) +def test_to_datetime_scalar(arg, utc, unit, format): + bf_result = bpd.to_datetime(arg, utc=utc, unit=unit, format=format) + pd_result = pd.to_datetime(arg, utc=utc, unit=unit, format=format) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("arg", "utc", "unit", "format"), + [ + ([173872738], False, None, None), + ([32787983.23], True, "s", None), + ( + [datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("America/New_York"))], + True, + None, + None, + ), + (["2023-01-01"], True, None, "%Y-%m-%d"), + (["2023-02-01T15:00:00+07:22"], True, None, None), + (["01-31-2023 14:30 -0800"], True, None, "%m-%d-%Y %H:%M %z"), + (["01-31-2023 14:00", "02-01-2023 15:00"], True, None, "%m-%d-%Y %H:%M"), + ], +) +def test_to_datetime_iterable(arg, utc, unit, format): + bf_result = ( + bpd.to_datetime(arg, utc=utc, unit=unit, format=format) + .to_pandas() + .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") + ) + pd_result = pd.Series( + pd.to_datetime(arg, utc=utc, unit=unit, format=format) + ).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) + + +def test_to_datetime_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col = "int64_too" + bf_result = ( + bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") + ) + pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) diff --git a/third_party/bigframes_vendored/pandas/core/tools/__init__.py b/third_party/bigframes_vendored/pandas/core/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py new file mode 100644 index 0000000000..8a8a562bae --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -0,0 +1,77 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py + +from datetime import datetime +from typing import Iterable, Mapping, Union + +import pandas as pd + +from bigframes import constants, series + +local_scalars = Union[int, float, str, datetime] +local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] + + +def to_datetime( + arg, + *, + utc=False, + format=None, + unit=None, +) -> Union[pd.Timestamp, datetime, series.Series]: + """ + This function converts a scalar, array-like or Series to a datetime object. + + .. note:: + BigQuery only supports precision up to microseconds (us). Therefore, when working + with timestamps that have a finer granularity than microseconds, be aware that + the additional precision will not be represented in BigQuery. + + .. note:: + The format strings for specifying datetime representations in BigQuery and pandas + are not completely identical. Ensure that the format string provided is compatible + with BigQuery. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Converting a Scalar to datetime: + + >>> scalar = 123456.789 + >>> bpd.to_datetime(scalar, unit = 's') + Timestamp('1970-01-02 10:17:36.789000') + + Converting a List of Strings without Timezone Information: + + >>> list_str = ["01-31-2021 14:30", "02-28-2021 15:45"] + >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True) + 0 2021-01-31 14:30:00+00:00 + 1 2021-02-28 15:45:00+00:00 + Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + + Converting a Series of Strings with Timezone Information: + + >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45+00:00"]) + >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M%Z", utc=True) + 0 2021-01-31 06:30:00+00:00 + 1 2021-02-28 15:45:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + + Args: + arg (int, float, str, datetime, list, tuple, 1-d array, Series): + The object to convert to a datetime. + utc (bool, default False): + Control timezone-related parsing, localization and conversion. If True, the + function always returns a timezone-aware UTC-localized timestamp or series. + If False (default), inputs will not be coerced to UTC. + format (str, default None): + The strftime to parse time, e.g. "%d/%m/%Y". + unit (str, default 'ns'): + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or + float number. + + Returns: + Timestamp, datetime.datetime or bigframes.series.Series: Return type depends on input. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 208e081fa99e17b8085e83c111c07eb6fc5c4730 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 12 Feb 2024 23:16:16 +0000 Subject: [PATCH 07/10] feat: limited support of lamdas in `Series.apply` (#345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEGIN_COMMIT_OVERRIDE feat: limited support of lambdas in `Series.apply` (#345) END_COMMIT_OVERRIDE Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated https://screenshot.googleplex.com/6ZEiKXPz8LWMTRf Partially fixes internal issue 295964341 ๐Ÿฆ• --- bigframes/series.py | 45 +++++- tests/system/small/test_series.py | 151 ++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 82 +++++++++- 3 files changed, 270 insertions(+), 8 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 74580989f3..4aef959a76 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -58,6 +58,12 @@ LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] +_remote_function_recommendation_message = ( + "Your functions could not be applied directly to the Series." + " Try converting it to a remote function." +) + + @log_adapter.class_logger class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): def __init__(self, *args, **kwargs): @@ -1210,12 +1216,43 @@ def _groupby_values( dropna=dropna, ) - def apply(self, func) -> Series: + def apply( + self, func, by_row: typing.Union[typing.Literal["compat"], bool] = "compat" + ) -> Series: # TODO(shobs, b/274645634): Support convert_dtype, args, **kwargs # is actually a ternary op # Reproject as workaround to applying filter too late. This forces the filter # to be applied before passing data to remote function, protecting from bad # inputs causing errors. + + if by_row not in ["compat", False]: + raise ValueError("Param by_row must be one of 'compat' or False") + + if not callable(func): + raise ValueError( + "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." + ) + + if not hasattr(func, "bigframes_remote_function"): + # It is not a remote function + # Then it must be a vectorized function that applies to the Series + # as a whole + if by_row: + raise ValueError( + "A vectorized non-remote function can be provided only with by_row=False." + " For element-wise operation it must be a remote function." + ) + + try: + return func(self) + except Exception as ex: + # This could happen if any of the operators in func is not + # supported on a Series. Let's guide the customer to use a + # remote function instead + if hasattr(ex, "message"): + ex.message += f"\n{_remote_function_recommendation_message}" + raise + reprojected_series = Series(self._block._force_reproject()) return reprojected_series._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) @@ -1325,7 +1362,11 @@ def duplicated(self, keep: str = "first") -> Series: def mask(self, cond, other=None) -> Series: if callable(cond): - cond = self.apply(cond) + if hasattr(cond, "bigframes_remote_function"): + cond = self.apply(cond) + else: + # For non-remote function assume that it is applicable on Series + cond = self.apply(cond, by_row=False) if not isinstance(cond, Series): raise TypeError( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f2790d190a..42651ed96f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2560,6 +2560,51 @@ def test_mask_custom_value(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("lambda_",), + [ + pytest.param(lambda x: x > 0), + pytest.param( + lambda x: True if x > 0 else False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "lambda_arithmatic", + "lambda_arbitrary", + ], +) +def test_mask_lambda(scalars_dfs, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(lambda_).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(lambda_) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_mask_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x < 1000000 + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(foo).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("column", "to_type"), [ @@ -3042,3 +3087,109 @@ def test_series_iter( scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] ): assert bf_i == pd_i + + +@pytest.mark.parametrize( + ( + "col", + "lambda_", + ), + [ + pytest.param("int64_col", lambda x: x * x + x + 1), + pytest.param("int64_col", lambda x: x % 2 == 1), + pytest.param("string_col", lambda x: x + "_suffix"), + ], + ids=[ + "lambda_int_int", + "lambda_int_bool", + "lambda_str_str", + ], +) +def test_apply_lambda(scalars_dfs, col, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df[col] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(lambda_) + + bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() + + pd_col = scalars_pandas_df[col] + pd_result = pd_col.apply(lambda_) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.log), + pytest.param(numpy.sqrt), + pytest.param(numpy.sin), + ], + ids=[ + "log", + "sqrt", + "sin", + ], +) +def test_apply_numpy_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(ufunc) + + bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(ufunc) + + assert_series_equal(bf_result, pd_result) + + +def test_apply_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x * x + 2 * x + 3 + + bf_col = scalars_df["int64_col"] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(foo) + + bf_result = bf_col.apply(foo, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("col", "lambda_", "exception"), + [ + pytest.param("int64_col", {1: 2, 3: 4}, ValueError), + pytest.param("int64_col", numpy.square, TypeError), + pytest.param("string_col", lambda x: x.capitalize(), AttributeError), + ], + ids=[ + "not_callable", + "numpy_ufunc", + "custom_lambda", + ], +) +def test_apply_not_supported(scalars_dfs, col, lambda_, exception): + scalars_df, _ = scalars_dfs + + bf_col = scalars_df[col] + with pytest.raises(exception): + bf_col.apply(lambda_, by_row=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 4232d3ec2a..b203471606 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1116,18 +1116,24 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: def apply( self, func, + by_row="compat", ) -> DataFrame | Series: """ Invoke function on values of a Series. + Can be ufunc (a NumPy function that applies to the entire Series) or a + Python function that only works on single values. If it is an arbitrary + python function then converting it into a `remote_function` is recommended. + **Examples:** >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + For applying arbitrary python function a `remote_funciton` is recommended. + Let's use ``reuse=False`` flag to make sure a new `remote_function` is created every time we run the following code, but you can skip it - to potentially reuse a previously deployed ``remote_function`` from + to potentially reuse a previously deployed `remote_function` from the same user defined function. >>> @bpd.remote_function([int], float, reuse=False) @@ -1152,9 +1158,9 @@ def apply( 4 2.0 dtype: Float64 - You could turn a user defined function with external package - dependencies into a BigQuery DataFrames remote function. You would - provide the names of the packages via ``packages`` param. + To turn a user defined function with external package dependencies into + a `remote_function`, you would provide the names of the packages via + `packages` param. >>> @bpd.remote_function( ... [str], @@ -1176,11 +1182,48 @@ def apply( >>> names = bpd.Series(["Alice", "Bob"]) >>> hashes = names.apply(get_hash) + Simple vectorized functions, lambdas or ufuncs can be applied directly + with `by_row=False`. + + >>> nums = bpd.Series([1, 2, 3, 4]) + >>> nums + 0 1 + 1 2 + 2 3 + 3 4 + dtype: Int64 + >>> nums.apply(lambda x: x*x + 2*x + 1, by_row=False) + 0 4 + 1 9 + 2 16 + 3 25 + dtype: Int64 + + >>> def is_odd(num): + ... return num % 2 == 1 + >>> nums.apply(is_odd, by_row=False) + 0 True + 1 False + 2 True + 3 False + dtype: boolean + + >>> nums.apply(np.log, by_row=False) + 0 0.0 + 1 0.693147 + 2 1.098612 + 3 1.386294 + dtype: Float64 + Args: func (function): BigFrames DataFrames ``remote_function`` to apply. The function should take a scalar and return a scalar. It will be applied to every element in the ``Series``. + by_row (False or "compat", default "compat"): + If `"compat"` , func must be a remote function which will be + passed each element of the Series, like `Series.map`. If False, + the func will be passed the whole Series at once. Returns: bigframes.series.Series: A new Series with values representing the @@ -2680,7 +2723,8 @@ def mask(self, cond, other): dtype: Int64 You can mask the values in the Series based on a condition. The values - matching the condition would be masked. + matching the condition would be masked. The condition can be provided in + formm of a Series. >>> s.mask(s % 2 == 0) 0 @@ -2736,6 +2780,32 @@ def mask(self, cond, other): 2 Caroline dtype: string + Simple vectorized (i.e. they only perform operations supported on a + Series) lambdas or python functions can be used directly. + + >>> nums = bpd.Series([1, 2, 3, 4], name="nums") + >>> nums + 0 1 + 1 2 + 2 3 + 3 4 + Name: nums, dtype: Int64 + >>> nums.mask(lambda x: (x+1) % 2 == 1) + 0 1 + 1 + 2 3 + 3 + Name: nums, dtype: Int64 + + >>> def is_odd(num): + ... return num % 2 == 1 + >>> nums.mask(is_odd) + 0 + 1 2 + 2 + 3 4 + Name: nums, dtype: Int64 + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is False, keep the original value. Where True, replace From 90caf865efc940f94e16643bda7ba261c2f2e473 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:40:20 -0800 Subject: [PATCH 08/10] feat: support read_gbq wildcard table path (#377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # ๐Ÿฆ• --- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 64 +++++++++++++------ tests/system/small/test_session.py | 26 ++++++++ tests/unit/session/test_session.py | 31 ++++++++- .../bigframes_vendored/pandas/io/gbq.py | 15 ++++- 5 files changed, 112 insertions(+), 26 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5320e84e21..110978a7f1 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -551,6 +551,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), columns: Iterable[str] = (), max_results: Optional[int] = None, + filters: vendored_pandas_gbq.FiltersType = (), use_cache: bool = True, col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: @@ -561,6 +562,7 @@ def read_gbq_table( index_col=index_col, columns=columns, max_results=max_results, + filters=filters, use_cache=use_cache, col_order=col_order, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 15d4b3577b..df0cd6e947 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -30,6 +30,7 @@ Iterable, List, Literal, + Mapping, MutableSequence, Optional, Sequence, @@ -115,6 +116,11 @@ def _is_query(query_or_table: str) -> bool: return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None +def _is_table_with_wildcard_suffix(query_or_table: str) -> bool: + """Determine if `query_or_table` is a table and contains a wildcard suffix.""" + return not _is_query(query_or_table) and query_or_table.endswith("*") + + class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -248,7 +254,9 @@ def read_gbq( elif col_order: columns = col_order - query_or_table = self._filters_to_query(query_or_table, columns, filters) + filters = list(filters) + if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): + query_or_table = self._to_query(query_or_table, columns, filters) if _is_query(query_or_table): return self._read_gbq_query( @@ -272,13 +280,18 @@ def read_gbq( use_cache=use_cache, ) - def _filters_to_query(self, query_or_table, columns, filters): - """Convert filters to query""" - if len(filters) == 0: - return query_or_table - + def _to_query( + self, + query_or_table: str, + columns: Iterable[str], + filters: third_party_pandas_gbq.FiltersType, + ) -> str: + """Compile query_or_table with conditions(filters, wildcards) to query.""" + filters = list(filters) sub_query = ( - f"({query_or_table})" if _is_query(query_or_table) else query_or_table + f"({query_or_table})" + if _is_query(query_or_table) + else f"`{query_or_table}`" ) select_clause = "SELECT " + ( @@ -287,7 +300,7 @@ def _filters_to_query(self, query_or_table, columns, filters): where_clause = "" if filters: - valid_operators = { + valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { "in": "IN", "not in": "NOT IN", "==": "=", @@ -298,19 +311,16 @@ def _filters_to_query(self, query_or_table, columns, filters): "!=": "!=", } - if ( - isinstance(filters, Iterable) - and isinstance(filters[0], Tuple) - and (len(filters[0]) == 0 or not isinstance(filters[0][0], Tuple)) + # If single layer filter, add another pseudo layer. So the single layer represents "and" logic. + if isinstance(filters[0], tuple) and ( + len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple) ): - filters = [filters] + filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) or_expressions = [] for group in filters: if not isinstance(group, Iterable): - raise ValueError( - f"Filter group should be a iterable, {group} is not valid." - ) + group = [group] and_expressions = [] for filter_item in group: @@ -329,13 +339,13 @@ def _filters_to_query(self, query_or_table, columns, filters): if operator not in valid_operators: raise ValueError(f"Operator {operator} is not valid.") - operator = valid_operators[operator] + operator_str = valid_operators[operator] - if operator in ["IN", "NOT IN"]: + if operator_str in ["IN", "NOT IN"]: value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator} ({value_list})" + expression = f"`{column}` {operator_str} ({value_list})" else: - expression = f"`{column}` {operator} {repr(value)}" + expression = f"`{column}` {operator_str} {repr(value)}" and_expressions.append(expression) or_expressions.append(" AND ".join(and_expressions)) @@ -521,6 +531,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), columns: Iterable[str] = (), max_results: Optional[int] = None, + filters: third_party_pandas_gbq.FiltersType = (), use_cache: bool = True, col_order: Iterable[str] = (), ) -> dataframe.DataFrame: @@ -546,6 +557,19 @@ def read_gbq_table( elif col_order: columns = col_order + filters = list(filters) + if len(filters) != 0 or _is_table_with_wildcard_suffix(query): + query = self._to_query(query, columns, filters) + + return self._read_gbq_query( + query, + index_col=index_col, + columns=columns, + max_results=max_results, + api_name="read_gbq_table", + use_cache=use_cache, + ) + return self._read_gbq_table( query=query, index_col=index_col, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2d9c332de1..85573472b9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -327,6 +327,32 @@ def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id): assert df3 is not None +def test_read_gbq_wildcard(session: bigframes.Session): + df = session.read_gbq("bigquery-public-data.noaa_gsod.gsod193*") + assert df.shape == (348485, 32) + + +def test_read_gbq_wildcard_with_filter(session: bigframes.Session): + df = session.read_gbq( + "bigquery-public-data.noaa_gsod.gsod19*", + filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore + ) + assert df.shape == (348485, 32) + + +def test_read_gbq_table_wildcard(session: bigframes.Session): + df = session.read_gbq_table("bigquery-public-data.noaa_gsod.gsod193*") + assert df.shape == (348485, 32) + + +def test_read_gbq_table_wildcard_with_filter(session: bigframes.Session): + df = session.read_gbq_table( + "bigquery-public-data.noaa_gsod.gsod19*", + filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore + ) + assert df.shape == (348485, 32) + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index ea8d0882ae..b474c9f63e 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -125,7 +125,7 @@ def test_session_init_fails_with_no_project(): "test_table", [], [("date_col", ">", "2022-10-20")], - "SELECT * FROM test_table AS sub WHERE `date_col` > '2022-10-20'", + "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", id="table_input", ), pytest.param( @@ -136,7 +136,7 @@ def test_session_init_fails_with_no_project(): (("string_col", "in", ["Hello, World!", "ใ“ใ‚“ใซใกใฏ"]),), ], ( - "SELECT `row_index`, `string_col` FROM test_table AS sub WHERE " + "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " "'ใ“ใ‚“ใซใกใฏ')" ), @@ -156,5 +156,30 @@ def test_session_init_fails_with_no_project(): ) def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output): session = resources.create_bigquery_session() - query = session._filters_to_query(query_or_table, columns, filters) + query = session._to_query(query_or_table, columns, filters) + assert query == expected_output + + +@pytest.mark.parametrize( + ("query_or_table", "columns", "filters", "expected_output"), + [ + pytest.param( + "test_table*", + [], + [], + "SELECT * FROM `test_table*` AS sub", + id="wildcard_table_input", + ), + pytest.param( + "test_table*", + [], + [("_TABLE_SUFFIX", ">", "2022-10-20")], + "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", + id="wildcard_table_input_with_filter", + ), + ], +) +def test_read_gbq_wildcard(query_or_table, columns, filters, expected_output): + session = resources.create_bigquery_session() + query = session._to_query(query_or_table, columns, filters) assert query == expected_output diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 8e2c9f092d..1f31c530d2 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -7,8 +7,9 @@ from bigframes import constants -FilterType = Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any] -FiltersType = Iterable[Union[FilterType, Iterable[FilterType]]] +FilterOps = Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"] +FilterType = Tuple[str, FilterOps, Any] +FiltersType = Union[Iterable[FilterType], Iterable[Iterable[FilterType]]] class GBQIOMixin: @@ -52,6 +53,9 @@ def read_gbq( >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + Read table path with wildcard suffix and filters: + >>> df = bpd.read_gbq_table("bigquery-public-data.noaa_gsod.gsod19*", filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")]) + Preserve ordering in a query input. >>> df = bpd.read_gbq(''' @@ -96,6 +100,8 @@ def read_gbq( A SQL string to be executed or a BigQuery table to be read. The table must be specified in the format of `project.dataset.tablename` or `dataset.tablename`. + Can also take wildcard table name, such as `project.dataset.table_prefix*`. + In tha case, will read all the matched table as one DataFrame. index_col (Iterable[str] or str): Name of result column(s) to use for index in results DataFrame. columns (Iterable[str]): @@ -104,7 +110,7 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. - filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To + filters (Union[Iterable[FilterType], Iterable[Iterable[FilterType]]], default ()): To filter out data. Filter syntax: [[(column, op, val), โ€ฆ],โ€ฆ] where op is [==, >, >=, <, <=, !=, in, not in]. The innermost tuples are transposed into a set of filters applied through an AND @@ -112,6 +118,9 @@ def read_gbq( through an OR operation. A single Iterable of tuples can also be used, meaning that no OR operation between set of filters is to be conducted. + If using wildcard table suffix in query_or_table, can specify + '_table_suffix' pseudo column to filter the tables to be read + into the DataFrame. use_cache (bool, default True): Whether to cache the query inputs. Default to True. col_order (Iterable[str]): From 126f5660bd61bd8998e5f17ca0cbd39959590367 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:00:21 -0800 Subject: [PATCH 09/10] feat: add ml.metrics.pairwise.cosine_similarity function (#374) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # ๐Ÿฆ• --- bigframes/ml/core.py | 97 +++++++++++++------ bigframes/ml/metrics/__init__.py | 39 ++++++++ .../ml/{metrics.py => metrics/_metrics.py} | 0 bigframes/ml/metrics/pairwise.py | 34 +++++++ bigframes/ml/sql.py | 15 ++- docs/templates/toc.yml | 4 + .../system/small/ml/test_metrics_pairwise.py | 35 +++++++ tests/unit/ml/test_sql.py | 55 ++++++----- .../sklearn/metrics/pairwise.py | 31 ++++++ 9 files changed, 260 insertions(+), 50 deletions(-) create mode 100644 bigframes/ml/metrics/__init__.py rename bigframes/ml/{metrics.py => metrics/_metrics.py} (100%) create mode 100644 bigframes/ml/metrics/pairwise.py create mode 100644 tests/system/small/ml/test_metrics_pairwise.py create mode 100644 third_party/bigframes_vendored/sklearn/metrics/pairwise.py diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 51fed05901..c496133aa7 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -17,7 +17,7 @@ from __future__ import annotations import datetime -from typing import Callable, cast, Iterable, Mapping, Optional, Union +from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union import uuid from google.cloud import bigquery @@ -28,34 +28,12 @@ import bigframes.pandas as bpd -class BqmlModel: - """Represents an existing BQML model in BigQuery. - - Wraps the BQML API and SQL interface to expose the functionality needed for - BigQuery DataFrames ML. - """ +class BaseBqml: + """Base class for BQML functionalities.""" - def __init__(self, session: bigframes.Session, model: bigquery.Model): + def __init__(self, session: bigframes.Session): self._session = session - self._model = model - self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( - self.model_name - ) - - @property - def session(self) -> bigframes.Session: - """Get the BigQuery DataFrames session that this BQML model wrapper is tied to""" - return self._session - - @property - def model_name(self) -> str: - """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id""" - return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}" - - @property - def model(self) -> bigquery.Model: - """Get the BQML model associated with this wrapper""" - return self._model + self._base_sql_generator = ml_sql.BaseSqlGenerator() def _apply_sql( self, @@ -84,6 +62,71 @@ def _apply_sql( return df + def distance( + self, + x: bpd.DataFrame, + y: bpd.DataFrame, + type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"], + name: str, + ) -> bpd.DataFrame: + """Calculate ML.DISTANCE from DataFrame inputs. + + Args: + x: + input DataFrame + y: + input DataFrame + type: + Distance types, accept values are "EUCLIDEAN", "MANHATTAN", "COSINE". + name: + name of the output result column + """ + assert len(x.columns) == 1 and len(y.columns) == 1 + + input_data = x._cached().join(y._cached(), how="outer") + x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] + + return self._apply_sql( + input_data, + lambda source_df: self._base_sql_generator.ml_distance( + x_column_id, + y_column_id, + type=type, + source_df=source_df, + name=name, + ), + ) + + +class BqmlModel(BaseBqml): + """Represents an existing BQML model in BigQuery. + + Wraps the BQML API and SQL interface to expose the functionality needed for + BigQuery DataFrames ML. + """ + + def __init__(self, session: bigframes.Session, model: bigquery.Model): + self._session = session + self._model = model + self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( + self.model_name + ) + + @property + def session(self) -> bigframes.Session: + """Get the BigQuery DataFrames session that this BQML model wrapper is tied to""" + return self._session + + @property + def model_name(self) -> str: + """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id""" + return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}" + + @property + def model(self) -> bigquery.Model: + """Get the BQML model associated with this wrapper""" + return self._model + def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py new file mode 100644 index 0000000000..6b0a243426 --- /dev/null +++ b/bigframes/ml/metrics/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.ml.metrics import pairwise +from bigframes.ml.metrics._metrics import ( + accuracy_score, + auc, + confusion_matrix, + f1_score, + precision_score, + r2_score, + recall_score, + roc_auc_score, + roc_curve, +) + +__all__ = [ + "r2_score", + "recall_score", + "accuracy_score", + "roc_curve", + "roc_auc_score", + "auc", + "confusion_matrix", + "precision_score", + "f1_score", + "pairwise", +] diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics/_metrics.py similarity index 100% rename from bigframes/ml/metrics.py rename to bigframes/ml/metrics/_metrics.py diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py new file mode 100644 index 0000000000..04577c89d3 --- /dev/null +++ b/bigframes/ml/metrics/pairwise.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Union + +from bigframes.ml import core, utils +import bigframes.pandas as bpd +import third_party.bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise + + +def cosine_similarity( + X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series] +) -> bpd.DataFrame: + X, Y = utils.convert_to_dataframe(X, Y) + if len(X.columns) != 1 or len(Y.columns) != 1: + raise ValueError("Inputs X and Y can only contain 1 column.") + + base_bqml = core.BaseBqml(session=X._session) + return base_bqml.distance(X, Y, type="COSINE", name="cosine_similarity") + + +cosine_similarity.__doc__ = inspect.getdoc(vendored_metrics_pairwise.cosine_similarity) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index f726317bb4..7999cb90a3 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -16,7 +16,7 @@ Generates SQL queries needed for BigQuery DataFrames ML """ -from typing import Iterable, Mapping, Optional, Union +from typing import Iterable, Literal, Mapping, Optional, Union import google.cloud.bigquery @@ -133,6 +133,19 @@ def ml_label_encoder( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + def ml_distance( + self, + col_x: str, + col_y: str, + type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"], + source_df: bpd.DataFrame, + name: str, + ) -> str: + """Encode ML.DISTANCE for BQML. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance""" + source_sql, _, _ = source_df._to_sql_query(include_index=True) + return f"""SELECT *, ML.DISTANCE({col_x}, {col_y}, '{type}') AS {name} FROM ({source_sql})""" + class ModelCreationSqlGenerator(BaseSqlGenerator): """Sql generator for creating a model entity. Model id is the standalone id without project id and dataset id.""" diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index e921279418..90dbc504b0 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -116,6 +116,10 @@ - name: metrics uid: bigframes.ml.metrics name: metrics + - items: + - name: metrics.pairwise + uid: bigframes.ml.metrics.pairwise + name: metrics.pairwise - items: - name: model_selection uid: bigframes.ml.model_selection diff --git a/tests/system/small/ml/test_metrics_pairwise.py b/tests/system/small/ml/test_metrics_pairwise.py new file mode 100644 index 0000000000..c02a36abbc --- /dev/null +++ b/tests/system/small/ml/test_metrics_pairwise.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from bigframes.ml import metrics +import bigframes.pandas as bpd + + +def test_cosine_similarity(): + x_col = [np.array([4.1, 0.5, 1.0])] + y_col = [np.array([3.0, 0.0, 2.5])] + X = bpd.read_pandas(pd.DataFrame({"X": x_col})) + Y = bpd.read_pandas(pd.DataFrame({"Y": y_col})) + + result = metrics.pairwise.cosine_similarity(X, Y) + expected_pd_df = pd.DataFrame( + {"X": x_col, "Y": y_col, "cosine_similarity": [0.108199]} + ) + + pd.testing.assert_frame_equal( + result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False + ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index de80dad710..52c10f3144 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -47,7 +47,7 @@ def mock_df(): return mock_df -def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): +def test_options_correct(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.options( model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6 ) @@ -60,7 +60,7 @@ def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerato ) -def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): +def test_transform_correct(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.transform( "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b", @@ -75,35 +75,35 @@ def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenera ) -def test_standard_scaler_produces_correct_sql( +def test_standard_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_standard_scaler("col_a", "scaled_col_a") assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a" -def test_max_abs_scaler_produces_correct_sql( +def test_max_abs_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_max_abs_scaler("col_a", "scaled_col_a") assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a" -def test_min_max_scaler_produces_correct_sql( +def test_min_max_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_min_max_scaler("col_a", "scaled_col_a") assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" -def test_k_bins_discretizer_produces_correct_sql( +def test_k_bins_discretizer_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a") assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" -def test_one_hot_encoder_produces_correct_sql( +def test_one_hot_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_one_hot_encoder( @@ -114,14 +114,25 @@ def test_one_hot_encoder_produces_correct_sql( ) -def test_label_encoder_produces_correct_sql( +def test_label_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a") assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" -def test_create_model_produces_correct_sql( +def test_distance_correct( + base_sql_generator: ml_sql.BaseSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = base_sql_generator.ml_distance("col_a", "col_b", "COSINE", mock_df, "cosine") + assert ( + sql + == "SELECT *, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM (input_X_sql)" + ) + + +def test_create_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -142,7 +153,7 @@ def test_create_model_produces_correct_sql( ) -def test_create_model_transform_produces_correct_sql( +def test_create_model_transform_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -170,7 +181,7 @@ def test_create_model_transform_produces_correct_sql( ) -def test_create_remote_model_produces_correct_sql( +def test_create_remote_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_remote_model( @@ -190,7 +201,7 @@ def test_create_remote_model_produces_correct_sql( ) -def test_create_remote_model_with_params_produces_correct_sql( +def test_create_remote_model_with_params_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_remote_model( @@ -216,7 +227,7 @@ def test_create_remote_model_with_params_produces_correct_sql( ) -def test_create_imported_model_produces_correct_sql( +def test_create_imported_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_imported_model( @@ -273,7 +284,7 @@ def test_alter_model_correct_sql( ) -def test_ml_predict_produces_correct_sql( +def test_ml_predict_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -285,7 +296,7 @@ def test_ml_predict_produces_correct_sql( ) -def test_ml_evaluate_produces_correct_sql( +def test_ml_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -297,7 +308,7 @@ def test_ml_evaluate_produces_correct_sql( ) -def test_ml_arima_evaluate_produces_correct_sql( +def test_ml_arima_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_arima_evaluate( @@ -310,7 +321,7 @@ def test_ml_arima_evaluate_produces_correct_sql( ) -def test_ml_evaluate_no_source_produces_correct_sql( +def test_ml_evaluate_no_source_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_evaluate() @@ -320,7 +331,7 @@ def test_ml_evaluate_no_source_produces_correct_sql( ) -def test_ml_centroids_produces_correct_sql( +def test_ml_centroids_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_centroids() @@ -346,7 +357,7 @@ def test_forecast_correct_sql( ) -def test_ml_generate_text_produces_correct_sql( +def test_ml_generate_text_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -363,7 +374,7 @@ def test_ml_generate_text_produces_correct_sql( ) -def test_ml_generate_text_embedding_produces_correct_sql( +def test_ml_generate_text_embedding_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -380,7 +391,7 @@ def test_ml_generate_text_embedding_produces_correct_sql( ) -def test_ml_principal_components_produces_correct_sql( +def test_ml_principal_components_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_principal_components() @@ -390,7 +401,7 @@ def test_ml_principal_components_produces_correct_sql( ) -def test_ml_principal_component_info_produces_correct_sql( +def test_ml_principal_component_info_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_principal_component_info() diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py new file mode 100644 index 0000000000..3ef5431178 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py @@ -0,0 +1,31 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Robert Layton +# Andreas Mueller +# Philippe Gervais +# Lars Buitinck +# Joel Nothman +# License: BSD 3 clause + +from bigframes import constants +import bigframes.pandas as bpd + + +def cosine_similarity(X, Y) -> bpd.DataFrame: + """Compute cosine similarity between samples in X and Y. + + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + K(X, Y) = / (||X||*||Y||) + + Args: + X (Series or single column DataFrame of array of numeric type): + Input data. + Y (Series or single column DataFrame of array of numeric type): + Input data. X and Y are mapped by indexes, must have the same index. + + Returns: + bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and cosine_similarity + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 747e5f64fa5e8d0b9af4d75c293590705b447cf1 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 13 Feb 2024 14:09:17 -0800 Subject: [PATCH 10/10] chore(main): release 0.21.0 (#373) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 23 +++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e2ab05f47..0091fb4a11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.20.1...v0.21.0) (2024-02-13) + + +### Features + +* Add `Series.cov` method ([#368](https://github.com/googleapis/python-bigquery-dataframes/issues/368)) ([443db22](https://github.com/googleapis/python-bigquery-dataframes/commit/443db228375da9b232376140c9d5b0db14895eae)) +* Add ml.llm.GeminiTextGenerator model ([#370](https://github.com/googleapis/python-bigquery-dataframes/issues/370)) ([de1e0a4](https://github.com/googleapis/python-bigquery-dataframes/commit/de1e0a451785e679f37b083be6d58c267319f56a)) +* Add ml.metrics.pairwise.cosine_similarity function ([#374](https://github.com/googleapis/python-bigquery-dataframes/issues/374)) ([126f566](https://github.com/googleapis/python-bigquery-dataframes/commit/126f5660bd61bd8998e5f17ca0cbd39959590367)) +* Add XGBoostModel ([#363](https://github.com/googleapis/python-bigquery-dataframes/issues/363)) ([d5518b2](https://github.com/googleapis/python-bigquery-dataframes/commit/d5518b28509be0ce070b22d9134a6a662412010a)) +* Limited support of lambdas in `Series.apply` ([#345](https://github.com/googleapis/python-bigquery-dataframes/issues/345)) ([208e081](https://github.com/googleapis/python-bigquery-dataframes/commit/208e081fa99e17b8085e83c111c07eb6fc5c4730)) +* Support bigframes.pandas.to_datetime for scalars, iterables and series. ([#372](https://github.com/googleapis/python-bigquery-dataframes/issues/372)) ([ffb0d15](https://github.com/googleapis/python-bigquery-dataframes/commit/ffb0d15602fe4d86e7a1aad72bba0a7049193a14)) +* Support read_gbq wildcard table path ([#377](https://github.com/googleapis/python-bigquery-dataframes/issues/377)) ([90caf86](https://github.com/googleapis/python-bigquery-dataframes/commit/90caf865efc940f94e16643bda7ba261c2f2e473)) + + +### Bug Fixes + +* Error message fix. ([#375](https://github.com/googleapis/python-bigquery-dataframes/issues/375)) ([930cf6b](https://github.com/googleapis/python-bigquery-dataframes/commit/930cf6b9ae8a48f422586dbd21b52e15c9ef9492)) + + +### Documentation + +* Clarify ADC pre-auth in a non-interactive environment ([#348](https://github.com/googleapis/python-bigquery-dataframes/issues/348)) ([99a9e6e](https://github.com/googleapis/python-bigquery-dataframes/commit/99a9e6e15c6eef4297035ce89bb619f8e4ca54ff)) + ## [0.20.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.20.0...v0.20.1) (2024-02-06) diff --git a/bigframes/version.py b/bigframes/version.py index fca3eec023..a713192ada 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.20.1" +__version__ = "0.21.0"