diff --git a/CHANGELOG.md b/CHANGELOG.md index bee4ecf095..24a1d8cb62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,15 +4,47 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) + + +### Features + +* (Preview) Support diff aggregation for timestamp series. ([#1405](https://github.com/googleapis/python-bigquery-dataframes/issues/1405)) ([abe48d6](https://github.com/googleapis/python-bigquery-dataframes/commit/abe48d6f13a954534460fa14c9337e1085d9fbb3)) +* Add `GeoSeries.from_wkt()` and `GeoSeries.to_wkt()` ([#1401](https://github.com/googleapis/python-bigquery-dataframes/issues/1401)) ([2993b28](https://github.com/googleapis/python-bigquery-dataframes/commit/2993b283966960430ad8482f40f177e276db2d64)) +* Support DF.__array__(copy=True) ([#1403](https://github.com/googleapis/python-bigquery-dataframes/issues/1403)) ([693ed8c](https://github.com/googleapis/python-bigquery-dataframes/commit/693ed8cfb1ecc3af161801225d3e9cda489c29dd)) +* Support routines with ARRAY return type in `read_gbq_function` ([#1412](https://github.com/googleapis/python-bigquery-dataframes/issues/1412)) ([4b60049](https://github.com/googleapis/python-bigquery-dataframes/commit/4b60049e8362bfb07c136d8b2eb02b984d71f084)) + + +### Bug Fixes + +* Calling to_timedelta() over timedeltas no longer changes their values ([#1411](https://github.com/googleapis/python-bigquery-dataframes/issues/1411)) ([650a190](https://github.com/googleapis/python-bigquery-dataframes/commit/650a1907fdf84897eb7aa288863ee27d938e0879)) +* Replace empty dict with None to avoid mutable default arguments ([#1416](https://github.com/googleapis/python-bigquery-dataframes/issues/1416)) ([fa4e3ad](https://github.com/googleapis/python-bigquery-dataframes/commit/fa4e3ad8bcd5db56fa26b26609cc7e58b1edf498)) + + +### Performance Improvements + +* Avoid redundant SQL casts ([#1399](https://github.com/googleapis/python-bigquery-dataframes/issues/1399)) ([6ee48d5](https://github.com/googleapis/python-bigquery-dataframes/commit/6ee48d5c16870f1caa99c3f658c2c1a0e14be749)) + + +### Dependencies + +* Remove scikit-learn and sqlalchemy as required dependencies ([#1296](https://github.com/googleapis/python-bigquery-dataframes/issues/1296)) ([fd8bc89](https://github.com/googleapis/python-bigquery-dataframes/commit/fd8bc894bdbdf551ebbec1fb93832588371ae6af)) + + +### Documentation + +* Add samples using SQL methods via the `bigframes.bigquery` module ([#1358](https://github.com/googleapis/python-bigquery-dataframes/issues/1358)) ([f54e768](https://github.com/googleapis/python-bigquery-dataframes/commit/f54e7688fda6372c6decc9b61796b0272d803c79)) +* Add snippets for visualizing a time series and creating a time series model for the Limit forecasted values in time series model tutorial ([#1310](https://github.com/googleapis/python-bigquery-dataframes/issues/1310)) ([c6c9120](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c9120e839647e5b3cb97f04a8d90cc8690b8a3)) + ## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) ### Features +* (Preview) Support add, sub, mult, div, and more between timedeltas ([#1396](https://github.com/googleapis/python-bigquery-dataframes/issues/1396)) ([ffa63d4](https://github.com/googleapis/python-bigquery-dataframes/commit/ffa63d47ca1dd1a18617f44d9b3bc33419656a20)) +* (Preview) Support comparison, ordering, and filtering for timedeltas 
([#1387](https://github.com/googleapis/python-bigquery-dataframes/issues/1387)) ([34d01b2](https://github.com/googleapis/python-bigquery-dataframes/commit/34d01b27f867abf10bddffdf4f88fa7052cd237c)) +* (Preview) Support subtraction in DATETIME/TIMESTAMP columns with timedelta columns ([#1390](https://github.com/googleapis/python-bigquery-dataframes/issues/1390)) ([50ad3a5](https://github.com/googleapis/python-bigquery-dataframes/commit/50ad3a56e9bd77bb77d60d7d5ec497e3335a7177)) * JSON dtype support for read_pandas and Series constructor ([#1391](https://github.com/googleapis/python-bigquery-dataframes/issues/1391)) ([44f4137](https://github.com/googleapis/python-bigquery-dataframes/commit/44f4137adb02790e07c696f0641bc58390857210)) -* Support add, sub, mult, div, and more between timedeltas ([#1396](https://github.com/googleapis/python-bigquery-dataframes/issues/1396)) ([ffa63d4](https://github.com/googleapis/python-bigquery-dataframes/commit/ffa63d47ca1dd1a18617f44d9b3bc33419656a20)) -* Support comparison, ordering, and filtering for timedeltas ([#1387](https://github.com/googleapis/python-bigquery-dataframes/issues/1387)) ([34d01b2](https://github.com/googleapis/python-bigquery-dataframes/commit/34d01b27f867abf10bddffdf4f88fa7052cd237c)) -* Support subtraction in DATETIME/TIMESTAMP columns with timedelta columns ([#1390](https://github.com/googleapis/python-bigquery-dataframes/issues/1390)) ([50ad3a5](https://github.com/googleapis/python-bigquery-dataframes/commit/50ad3a56e9bd77bb77d60d7d5ec497e3335a7177)) ### Bug Fixes @@ -36,6 +68,9 @@ ### Features +* (Preview) Support addition between a timestamp and a timedelta ([#1369](https://github.com/googleapis/python-bigquery-dataframes/issues/1369)) ([b598aa8](https://github.com/googleapis/python-bigquery-dataframes/commit/b598aa8ef4f6dd0cbca7629d290c5e511cdc86fc)) +* (Preview) Support casting floats and list-likes to timedelta series ([#1362](https://github.com/googleapis/python-bigquery-dataframes/issues/1362)) ([65933b6](https://github.com/googleapis/python-bigquery-dataframes/commit/65933b6b7608ec52717e818d8ec1732fb756b67b)) +* (Preview) Support timestamp subtractions ([#1346](https://github.com/googleapis/python-bigquery-dataframes/issues/1346)) ([86b7e72](https://github.com/googleapis/python-bigquery-dataframes/commit/86b7e72097ce67d88b72cfe031080d5af22f65cd)) * Add `bigframes.bigquery.st_area` and suggest it from `GeoSeries.area` ([#1318](https://github.com/googleapis/python-bigquery-dataframes/issues/1318)) ([8b5ffa8](https://github.com/googleapis/python-bigquery-dataframes/commit/8b5ffa8893b51016c51794865c40def74ea6716b)) * Add `GeoSeries.from_xy()` ([#1364](https://github.com/googleapis/python-bigquery-dataframes/issues/1364)) ([3c3e14c](https://github.com/googleapis/python-bigquery-dataframes/commit/3c3e14c715f476ca44f254c0d53d639ea5988a8d)) @@ -56,6 +91,7 @@ ### Features +* (Preview) Support timedeltas for read_pandas() ([#1349](https://github.com/googleapis/python-bigquery-dataframes/issues/1349)) ([866ba9e](https://github.com/googleapis/python-bigquery-dataframes/commit/866ba9efb54f11c1fc2ced0d7995fff86277b049)) * Add Series.keys() ([#1342](https://github.com/googleapis/python-bigquery-dataframes/issues/1342)) ([deb015d](https://github.com/googleapis/python-bigquery-dataframes/commit/deb015dc1276549519d51363501355272f8976d8)) * Allow `case_when` to change dtypes if case list contains the condition `(True, some_default_value)` ([#1311](https://github.com/googleapis/python-bigquery-dataframes/issues/1311)) 
([5c2a2c6](https://github.com/googleapis/python-bigquery-dataframes/commit/5c2a2c6086be20cba7da08ecd37899699aab518f)) * Support python type as astype arg ([#1316](https://github.com/googleapis/python-bigquery-dataframes/issues/1316)) ([b26e135](https://github.com/googleapis/python-bigquery-dataframes/commit/b26e13570f198ec4d252590a8c07253624db667a)) @@ -96,6 +132,7 @@ ### Features * (df|s).hist(), (df|s).line(), (df|s).area(), (df|s).bar(), df.scatter() ([#1320](https://github.com/googleapis/python-bigquery-dataframes/issues/1320)) ([bd3f584](https://github.com/googleapis/python-bigquery-dataframes/commit/bd3f584a7eab5d01dedebb7ca2485942ef5b5ebe)) +* (Preview) Define timedelta type and to_timedelta function ([#1317](https://github.com/googleapis/python-bigquery-dataframes/issues/1317)) ([3901951](https://github.com/googleapis/python-bigquery-dataframes/commit/39019510d0c2758096589ecd0d83175f313a8cf5)) * Add DataFrame.corrwith method ([#1315](https://github.com/googleapis/python-bigquery-dataframes/issues/1315)) ([b503355](https://github.com/googleapis/python-bigquery-dataframes/commit/b5033559a77a9bc5ffb7dc1e44e02aaaaf1e051e)) * Add DataFrame.mask method ([#1302](https://github.com/googleapis/python-bigquery-dataframes/issues/1302)) ([8b8155f](https://github.com/googleapis/python-bigquery-dataframes/commit/8b8155fef9c5cd36cfabf728ccebf6a14a1cbbda)) * Enable reading JSON data with `dbjson` extension dtype ([#1139](https://github.com/googleapis/python-bigquery-dataframes/issues/1139)) ([f672262](https://github.com/googleapis/python-bigquery-dataframes/commit/f6722629fb47eed5befb0ecae2e6b5ec9042d669)) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 262ced4fe8..7b8e47e2da 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -24,7 +24,7 @@ """ -def st_area(self) -> bigframes.series.Series: +def st_area(series: bigframes.series.Series) -> bigframes.series.Series: """ Returns the area in square meters covered by the polygons in the input GEOGRAPHY. @@ -35,7 +35,7 @@ def st_area(self) -> bigframes.series.Series: ..note:: - BigQuery's Geography functions, like `st_area`, interpet the geomtry + BigQuery's Geography functions, like `st_area`, interpret the geometry data type as a point set on the Earth's surface. A point set is a set of points, lines, and polygons on the WGS84 reference spheroid, with geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data @@ -88,6 +88,6 @@ def st_area(self) -> bigframes.series.Series: bigframes.pandas.Series: Series of float representing the areas. 
""" - series = self._apply_unary_op(ops.geo_area_op) + series = series._apply_unary_op(ops.geo_area_op) series.name = None return series diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 14c7a72e78..1099535712 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -37,12 +37,17 @@ class TransformFunction: """Simple transform function class to deal with Python UDF.""" def __init__( - self, func_def: FunctionDef, session: bigframes.session.Session, connection: str + self, + func_def: FunctionDef, + session: bigframes.session.Session, + connection: str, + max_batching_rows: int, ): self._func = func_def.func self._requirements = func_def.requirements self._session = session self._connection = connection + self._max_batching_rows = max_batching_rows def _input_bq_signature(self): sig = inspect.signature(self._func) @@ -67,7 +72,7 @@ def _create_udf(self): CREATE OR REPLACE FUNCTION `{udf_name}`({self._input_bq_signature()}) RETURNS {self._output_bq_type()} LANGUAGE python WITH CONNECTION `{self._connection}` -OPTIONS (entry_point='{func_name}', runtime_version='python-3.11', packages={packages}) +OPTIONS (entry_point='{func_name}', runtime_version='python-3.11', packages={packages}, max_batching_rows={self._max_batching_rows}) AS r\"\"\" diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 91a96febe0..4ec0b270ed 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -551,6 +551,24 @@ def _( raise TypeError(f"Cannot perform diff on type{column.type()}") +@compile_unary_agg.register +def _( + op: agg_ops.TimeSeriesDiffOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: + if not column.type().is_timestamp(): + raise TypeError(f"Cannot perform time series diff on type{column.type()}") + + original_column = cast(ibis_types.TimestampColumn, column) + shifted_column = cast( + ibis_types.TimestampColumn, + compile_unary_agg(agg_ops.ShiftOp(op.periods), column, window), + ) + + return original_column.delta(shifted_column, part="microsecond") + + @compile_unary_agg.register def _( op: agg_ops.AllOp, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 9b271bf67b..77f51542b4 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -240,9 +240,7 @@ def compile_read_table_unordered( return compiled.UnorderedIR( ibis_table, tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[scan_item.source_id].name(scan_item.id.sql) - ) + ibis_table[scan_item.source_id].name(scan_item.id.sql) for scan_item in scan.items ), ) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index af2b7908ad..c47c6cf07b 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -113,7 +113,9 @@ def cast_ibis_value( Raises: TypeError: if the type cast cannot be executed""" - if value.type() == to_type: + # normalize to nullable, which doesn't impact compatibility + value_type = value.type().copy(nullable=True) + if value_type == to_type: return value # casts that just work # TODO(bmil): add to this as more casts are verified @@ -189,52 +191,39 @@ def cast_ibis_value( ibis_dtypes.multipolygon: (IBIS_GEO_TYPE,), } - value = ibis_value_to_canonical_type(value) - if value.type() in good_casts: - if to_type in good_casts[value.type()]: + if value_type in good_casts: + if to_type 
in good_casts[value_type]: return value.try_cast(to_type) if safe else value.cast(to_type) else: # this should never happen raise TypeError( - f"Unexpected value type {value.type()}. {constants.FEEDBACK_LINK}" + f"Unexpected value type {value_type}. {constants.FEEDBACK_LINK}" ) # casts that need some encouragement # BigQuery casts bools to lower case strings. Capitalize the result to match Pandas # TODO(bmil): remove this workaround after fixing Ibis - if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string: + if value_type == ibis_dtypes.bool and to_type == ibis_dtypes.string: if safe: return cast(ibis_types.StringValue, value.try_cast(to_type)).capitalize() else: return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() - if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: + if value_type == ibis_dtypes.bool and to_type == ibis_dtypes.float64: if safe: return value.try_cast(ibis_dtypes.int64).try_cast(ibis_dtypes.float64) else: return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) - if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: + if value_type == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: return value != ibis_types.literal(0) raise TypeError( - f"Unsupported cast {value.type()} to {to_type}. {constants.FEEDBACK_LINK}" + f"Unsupported cast {value_type} to {to_type}. {constants.FEEDBACK_LINK}" ) -def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: - """Converts an Ibis expression to canonical type. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. - """ - ibis_type = value.type() - name = value.get_name() - # Allow REQUIRED fields to be joined with NULLABLE fields. - nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(name) - - def bigframes_dtype_to_ibis_dtype( bigframes_dtype: bigframes.dtypes.Dtype, ) -> ibis_dtypes.DataType: @@ -474,10 +463,19 @@ def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: return python_type_to_ibis_type(t) -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: +def ibis_type_from_bigquery_type( + type_: bigquery.StandardSqlDataType, +) -> ibis_dtypes.DataType: """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError( - tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + elif type_.type_kind == "ARRAY": + return ibis_dtypes.Array( + value_type=ibis_type_from_bigquery_type( + typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) + ) ) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + else: + return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index d5ce6e9e09..7111406646 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1013,6 +1013,18 @@ def geo_area_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).area() +@scalar_op_compiler.register_unary_op(ops.geo_st_astext_op) +def geo_st_astext_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).as_text() + + +@scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op) +def geo_st_geogfromtext_op_impl(x: ibis_types.Value): + # Ibis doesn't seem to provide a dedicated method to cast from string to geography, + # so we use a BigQuery scalar function, st_geogfromtext(), directly. + return st_geogfromtext(x) + + @scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).point( @@ -1174,6 +1186,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) @@ -1930,6 +1947,11 @@ def _ibis_num(number: float): return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) +@ibis_udf.scalar.builtin +def st_geogfromtext(a: str) -> ibis_dtypes.geography: # type: ignore + """Convert string to geography.""" + + @ibis_udf.scalar.builtin def timestamp(a: str) -> ibis_dtypes.timestamp: # type: ignore """Convert string to timestamp.""" diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index 9d52eae77d..bde1a4431c 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -22,6 +22,7 @@ from bigframes import operations as ops from bigframes.core import expression as ex from bigframes.core import nodes, schema, utils +from bigframes.operations import aggregations as aggs @dataclasses.dataclass @@ -59,6 +60,16 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod by = tuple(_rewrite_ordering_expr(x, root.schema) for x in root.by) return nodes.OrderByNode(root.child, by) + if isinstance(root, nodes.WindowOpNode): + return nodes.WindowOpNode( + root.child, + _rewrite_aggregation(root.expression, root.schema), + root.window_spec, + root.output_name, + root.never_skip_nulls, + root.skip_reproject_unsafe, + ) + return root @@ -114,6 +125,9 @@ def _rewrite_op_expr( # but for timedeltas: 
int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -143,9 +157,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -154,7 +168,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -163,6 +177,36 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result + + +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + +@functools.cache +def _rewrite_aggregation( + aggregation: ex.Aggregation, schema: schema.ArraySchema +) -> ex.Aggregation: + if not isinstance(aggregation, ex.UnaryAggregation): + return aggregation + if not isinstance(aggregation.op, aggs.DiffOp): + return aggregation + + if isinstance(aggregation.arg, ex.DerefOp): + input_type = schema.get_type(aggregation.arg.id.sql) + else: + input_type = aggregation.arg.dtype + + if dtypes.is_datetime_like(input_type): + return ex.UnaryAggregation( + aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg + ) + + return aggregation diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index e3808dfffd..c379db72be 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -41,8 +41,12 @@ class ArraySchema: def from_bq_table( cls, table: google.cloud.bigquery.Table, - column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + column_type_overrides: typing.Optional[ + typing.Dict[str, bigframes.dtypes.Dtype] + ] = None, ): + if column_type_overrides is None: + column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c02b182ee3..caf1b62e07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3705,7 +3705,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return 
self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) @@ -4086,9 +4088,12 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended + # to be an array, reconstruct the array from the string assuming it + # is a json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e4db904210..54b621a0f8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -874,4 +874,5 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "INT64", "INTEGER", "STRING", + "ARRAY", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a0518978a3..93b5c4c596 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -501,6 +501,7 @@ def try_delattr(attr): try_delattr("bigframes_remote_function") try_delattr("input_dtypes") try_delattr("output_dtype") + try_delattr("bigframes_bigquery_function_output_dtype") try_delattr("is_row_processor") try_delattr("ibis_node") @@ -589,6 +590,11 @@ def try_delattr(attr): ibis_signature.output_type ) ) + func.bigframes_bigquery_function_output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_output_type_for_bqrf + ) + ) func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index ef2c81a953..c2809b96eb 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -56,8 +56,10 @@ class ReturnTypeMissingError(ValueError): # TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if routine.return_type: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - routine.return_type.type_kind + ibis_output_type = ( + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + routine.return_type + ) ) else: raise ReturnTypeMissingError @@ -82,8 +84,8 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignatu return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - arg.data_type.type_kind + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + arg.data_type ) if arg.data_type else None @@ -233,6 +235,8 @@ def func(*bigframes_args, **bigframes_kwargs): else ibis_signature.output_type ) + func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index b757e2b971..ce9a59f26a 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -17,6 +17,7 @@ import bigframes_vendored.geopandas.geoseries as vendored_geoseries import geopandas.array # type: ignore 
+import bigframes.geopandas import bigframes.operations as ops import bigframes.series @@ -67,6 +68,12 @@ def area(self, crs=None) -> bigframes.series.Series: # type: ignore f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" ) + @classmethod + def from_wkt(cls, data, index=None) -> GeoSeries: + series = bigframes.series.Series(data, index=index) + + return cls(series._apply_unary_op(ops.geo_st_geogfromtext_op)) + @classmethod def from_xy(cls, x, y, index=None, session=None, **kwargs) -> GeoSeries: # TODO: if either x or y is local and the other is remote. Use the @@ -75,3 +82,8 @@ def from_xy(cls, x, y, index=None, session=None, **kwargs) -> GeoSeries: series_y = bigframes.series.Series(y, index=index, session=session, **kwargs) return cls(series_x._apply_binary_op(series_y, ops.geo_st_geogpoint_op)) + + def to_wkt(self: GeoSeries) -> bigframes.series.Series: + series = self._apply_unary_op(ops.geo_st_astext_op) + series.name = None + return series diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from bigframes.ml import utils import bigframes.pandas as bpd @@ -176,9 +175,9 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. - auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index f2bc1ecf85..7e6f1f793c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -87,6 +87,8 @@ ) from bigframes.operations.geo_ops import ( geo_area_op, + geo_st_astext_op, + geo_st_geogfromtext_op, geo_st_geogpoint_op, geo_x_op, geo_y_op, @@ -182,6 +184,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -257,6 +260,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", @@ -353,10 +357,12 @@ "euclidean_distance_op", "manhattan_distance_op", # Geo ops - "geo_x_op", - "geo_y_op", "geo_area_op", + "geo_st_astext_op", + "geo_st_geogfromtext_op", "geo_st_geogpoint_op", + "geo_x_op", + "geo_y_op", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 5f32cb980a..e9d102b42d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -484,6 +484,25 @@ class DiffOp(UnaryWindowOp): def skips_nulls(self): return False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if dtypes.is_datetime_like(input_types[0]): + return dtypes.TIMEDELTA_DTYPE + return super().output_type(*input_types) + + +@dataclasses.dataclass(frozen=True) +class 
TimeSeriesDiffOp(UnaryWindowOp): + periods: int + + @property + def skips_nulls(self): + return False + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if dtypes.is_datetime_like(input_types[0]): + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"expected a datetime-like type, but got {input_types[0]}") + @dataclasses.dataclass(frozen=True) class AllOp(UnaryAggregateOp): diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index e97515d42b..24ff315ad5 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -278,6 +278,7 @@ def image_blur( *, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, + max_batching_rows: int = 10000, ) -> bigframes.series.Series: """Blurs images. @@ -288,6 +289,7 @@ def image_blur( ksize (tuple(int, int)): Kernel size. dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + max_batching_rows (int, default 10,000): Max number of rows per batch sent to Cloud Run to execute the function. Returns: BigFrames Blob Series @@ -302,6 +304,7 @@ def image_blur( blob_func.image_blur_to_bytes_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() df["ksize_x"], df["ksize_y"] = ksize @@ -322,6 +325,7 @@ def image_blur( blob_func.image_blur_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") @@ -342,6 +346,7 @@ def image_resize( fy: float = 0.0, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, + max_batching_rows: int = 10000, ): """Resize images. @@ -354,6 +359,7 @@ def image_resize( fy (float, defalut 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size. dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + max_batching_rows (int, default 10,000): Max number of rows per batch sent to Cloud Run to execute the function. Returns: BigFrames Blob Series @@ -375,6 +381,7 @@ def image_resize( blob_func.image_resize_to_bytes_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() df["dsize_x"], df["dsizye_y"] = dsize @@ -396,6 +403,7 @@ def image_resize( blob_func.image_resize_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") @@ -417,6 +425,7 @@ def image_normalize( norm_type: str = "l2", dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, + max_batching_rows: int = 10000, ) -> bigframes.series.Series: """Normalize images. @@ -429,6 +438,7 @@ def image_normalize( norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax". dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. 
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + max_batching_rows (int, default 10,000): Max number of rows per batch sent to Cloud Run to execute the function. Returns: BigFrames Blob Series @@ -443,6 +453,7 @@ def image_normalize( blob_func.image_normalize_to_bytes_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() df["alpha"] = alpha @@ -465,6 +476,7 @@ def image_normalize( blob_func.image_normalize_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") @@ -480,7 +492,10 @@ def image_normalize( return dst def pdf_extract( - self, *, connection: Optional[str] = None + self, + *, + connection: Optional[str] = None, + max_batching_rows: int = 10000, ) -> bigframes.series.Series: """Extracts and chunks text from PDF URLs and saves the text as arrays of string. @@ -493,6 +508,8 @@ def pdf_extract( connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + max_batching_rows (int, default 10,000): Max number of rows per batch + sent to Cloud Run to execute the function. Returns: bigframes.series.Series: conatins all text from a pdf file @@ -502,14 +519,15 @@ connection = self._resolve_connection(connection) - pdf_chunk_udf = blob_func.TransformFunction( + pdf_extract_udf = blob_func.TransformFunction( blob_func.pdf_extract_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() src_rt = self._get_runtime_json_str(mode="R") - res = src_rt.apply(pdf_chunk_udf) + res = src_rt.apply(pdf_extract_udf) return res def pdf_chunk( @@ -518,6 +536,7 @@ connection: Optional[str] = None, chunk_size: int = 1000, overlap_size: int = 200, + max_batching_rows: int = 10000, ) -> bigframes.series.Series: """Extracts and chunks text from PDF URLs and saves the text as arrays of strings. @@ -535,6 +554,8 @@ overlap_size (int, default 200): the number of overlapping characters between consective chunks. The helps to ensure context is perserved across chunk boundaries. + max_batching_rows (int, default 10,000): Max number of rows per batch + sent to Cloud Run to execute the function. 
Returns: bigframe.series.Series of array[str], where each string is a @@ -557,6 +578,7 @@ def pdf_chunk( blob_func.pdf_chunk_def, session=self._block.session, connection=connection, + max_batching_rows=max_batching_rows, ).udf() src_rt = self._get_runtime_json_str(mode="R") diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 0ae8accd56..04441957e7 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -37,6 +37,22 @@ ), ) + +geo_st_astext_op = base_ops.create_unary_op( + name="geo_st_astext", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.STRING_DTYPE, description="geo-like" + ), +) + + +geo_st_geogfromtext_op = base_ops.create_unary_op( + name="geo_st_geogfromtext", + type_signature=op_typing.FixedOutputType( + dtypes.is_string_like, dtypes.GEO_DTYPE, description="string-like" + ), +) + geo_st_geogpoint_op = base_ops.create_binary_op( name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 1daacf4e6b..c9ce633cae 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -31,7 +31,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return input_type @@ -46,7 +46,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -63,7 +63,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -79,7 +79,7 @@ def output_type(self, *input_types): input_type = input_types[0] if input_type != dtypes.STRING_DTYPE: raise TypeError( - "Input type must be an valid JSON-formatted string type." + "Input type must be a valid JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.JSON_DTYPE @@ -93,7 +93,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE @@ -109,7 +109,7 @@ def output_type(self, *input_types): right_type = input_types[1] if not dtypes.is_json_like(left_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {left_type}" ) if not dtypes.is_json_encoding_type(right_type): @@ -130,7 +130,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." 
+ "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 5b738c0bb5..8505fd1607 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,7 +15,6 @@ import dataclasses import typing -from bigframes import dtypes from bigframes.operations import base_ops @@ -31,17 +30,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -55,17 +47,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -79,14 +64,7 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/bigframes/series.py b/bigframes/series.py index fe2d1aae0e..5a84dee32f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1545,9 +1545,12 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1585,9 +1588,12 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1812,7 +1818,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) diff --git a/bigframes/version.py b/bigframes/version.py index 27dfb23603..762deda9ff 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0" diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 4792c4fe27..ffd772e7b4 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -56,7 +56,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src/bigframes3/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -103,11 +103,11 @@ { "data": { "text/plain": [ - "171 POINT (-95.50742 42.39186)\n", - "219 POINT (-105.42894 37.27755)\n", - "402 POINT (-93.34905 32.10121)\n", - "526 POINT (-84.60469 43.29233)\n", - "677 POINT (-89.5681 37.04779)\n", + "78 POINT (-95.84727 44.4092)\n", + "130 POINT (-94.90431 41.67918)\n", + "544 POINT (-95.85272 40.38739)\n", + "995 POINT (-101.83333 47.30715)\n", + "1036 POINT (-88.36343 37.20952)\n", "Name: int_point_geom, dtype: geometry" ] }, @@ -125,7 +125,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Convert the five geo points to `bigframes.gopandas.GeoSeries`" + "### 2. Convert the five geo points to `GeoSeries`" ] }, { @@ -136,11 +136,11 @@ { "data": { "text/plain": [ - "0 POINT (-95.50742 42.39186)\n", - "1 POINT (-105.42894 37.27755)\n", - "2 POINT (-93.34905 32.10121)\n", - "3 POINT (-84.60469 43.29233)\n", - "4 POINT (-89.5681 37.04779)\n", + "0 POINT (-95.84727 44.4092)\n", + "1 POINT (-94.90431 41.67918)\n", + "2 POINT (-95.85272 40.38739)\n", + "3 POINT (-101.83333 47.30715)\n", + "4 POINT (-88.36343 37.20952)\n", "dtype: geometry" ] }, @@ -160,7 +160,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`." + "### 3. Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`." 
] }, { @@ -185,11 +185,11 @@ { "data": { "text/plain": [ - "0 -95.507421\n", - "1 -105.42894\n", - "2 -93.34905\n", - "3 -84.60469\n", - "4 -89.568097\n", + "0 -95.847268\n", + "1 -94.904312\n", + "2 -95.852721\n", + "3 -101.833328\n", + "4 -88.363426\n", "dtype: Float64" ] }, @@ -217,11 +217,11 @@ { "data": { "text/plain": [ - "0 42.39186\n", - "1 37.277547\n", - "2 32.101213\n", - "3 43.292326\n", - "4 37.047793\n", + "0 44.409195\n", + "1 41.679178\n", + "2 40.387389\n", + "3 47.307147\n", + "4 37.209517\n", "dtype: Float64" ] }, @@ -238,7 +238,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. Alternatively, use the `.geo` accessor to access GeoSeries methods from a `bigframes.pandas.Series` object." + "### 4. Alternatively, use the `.geo` accessor to access GeoSeries methods from a `bigframes.pandas.Series` object." ] }, { @@ -367,11 +367,11 @@ { "data": { "text/plain": [ - "54 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", - "256 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", - "266 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", - "485 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", - "765 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", + "59 POLYGON ((-96.92479 43.43217, -96.92477 43.430...\n", + "132 POLYGON ((-91.95104 40.05078, -91.95105 40.050...\n", + "223 POLYGON ((-84.39719 40.78658, -84.39718 40.783...\n", + "328 POLYGON ((-91.80469 31.48623, -91.80469 31.486...\n", + "396 POLYGON ((-79.87705 40.03683, -79.87688 40.036...\n", "Name: county_geom, dtype: geometry" ] }, @@ -389,7 +389,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. Convert the geometry collection to `bigframes.gopandas.GeoSeries`" + "### 2. Convert the geometry collection to `GeoSeries`" ] }, { @@ -400,11 +400,11 @@ { "data": { "text/plain": [ - "0 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", - "1 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", - "2 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", - "3 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", - "4 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", + "0 POLYGON ((-96.92479 43.43217, -96.92477 43.430...\n", + "1 POLYGON ((-91.95104 40.05078, -91.95105 40.050...\n", + "2 POLYGON ((-84.39719 40.78658, -84.39718 40.783...\n", + "3 POLYGON ((-91.80469 31.48623, -91.80469 31.486...\n", + "4 POLYGON ((-79.87705 40.03683, -79.87688 40.036...\n", "dtype: geometry" ] }, @@ -428,7 +428,7 @@ ] }, "source": [ - "## Note: `bigframes.geopandas.GeoSeries.area` raises NotImplementedError. " + "## Note: `GeoSeries.area` raises NotImplementedError. " ] }, { @@ -442,14 +442,14 @@ "outputs": [ { "ename": "NotImplementedError", - "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.36.0", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", - "File \u001b[0;32m~/src/bigframes3/bigframes/geopandas/geoseries.py:66\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 68\u001b[0m )\n", - "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0" + "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.36.0" ] } ], @@ -481,11 +481,11 @@ { "data": { "text/plain": [ - "0 1567505274.453911\n", - "1 1511436852.079554\n", - "2 4789800692.948824\n", - "3 1686877416.586061\n", - "4 740944862.916908\n", + "0 1493638545.448335\n", + "1 1321524759.411463\n", + "2 1052436575.522383\n", + "3 1937116615.360128\n", + "4 2065462414.544471\n", "dtype: Float64" ] }, @@ -503,7 +503,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Use `bigframes.geopandas.GeoSeries.from_xy()` to create a GeoSeries of `Point` geometries. " + "## Use `GeoSeries.from_xy()` to create a GeoSeries of `Point` geometries. " ] }, { @@ -521,11 +521,11 @@ { "data": { "text/plain": [ - "0 POINT (-95.50742 42.39186)\n", - "1 POINT (-105.42894 37.27755)\n", - "2 POINT (-93.34905 32.10121)\n", - "3 POINT (-84.60469 43.29233)\n", - "4 POINT (-89.5681 37.04779)\n", + "0 POINT (-95.84727 44.4092)\n", + "1 POINT (-94.90431 41.67918)\n", + "2 POINT (-95.85272 40.38739)\n", + "3 POINT (-101.83333 47.30715)\n", + "4 POINT (-88.36343 37.20952)\n", "dtype: geometry" ] }, @@ -537,6 +537,86 @@ "source": [ "bigframes.geopandas.GeoSeries.from_xy(geo_points.x, geo_points.y)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use `GeoSeries.to_wkt()` to convert geo points from geometry data type to Well-Known Text (WKT)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Reuse the `geo_points`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT(-95.8472678 44.4091953)\n", + "1 POINT(-94.9043119 41.679178)\n", + "2 POINT(-95.8527214 40.3873891)\n", + "3 POINT(-101.8333279 47.3071473)\n", + "4 POINT(-88.3634261 37.2095174)\n", + "dtype: string" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geo_to_wkts = bigframes.geopandas.GeoSeries.to_wkt(geo_points)\n", + "geo_to_wkts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use `GeoSeries.from_wkt()` to convert geo points from Well-Known Text (WKT) to geometry data type." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Reuse the `geo_to_wkts` results from `GeoSeries.to_wkt()`" ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-95.84727 44.4092)\n", + "1 POINT (-94.90431 41.67918)\n", + "2 POINT (-95.85272 40.38739)\n", + "3 POINT (-101.83333 47.30715)\n", + "4 POINT (-88.36343 37.20952)\n", + "dtype: geometry" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wkts_from_geo = bigframes.geopandas.GeoSeries.from_wkt(geo_to_wkts)\n", + "wkts_from_geo" + ] + } ], "metadata": { diff --git a/noxfile.py b/noxfile.py index b851bf160d..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,9 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.12": ["polars", "scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search @@ -96,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -468,8 +475,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -510,7 +516,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py new file mode 100644 index 0000000000..1a15790815 --- /dev/null +++ b/samples/snippets/bigquery_modules_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_struct] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create a new STRUCT Series with subfields for each column in the DataFrame. + lengths = bbq.struct( + bq_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + lengths.peek() + # 146 {'culmen_length_mm': 51.1, 'culmen_depth_mm': ... + # 278 {'culmen_length_mm': 48.2, 'culmen_depth_mm': ... + # 337 {'culmen_length_mm': 36.4, 'culmen_depth_mm': ... + # 154 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ... + # 185 {'culmen_length_mm': 50.1, 'culmen_depth_mm': ... + # dtype: struct[pyarrow] + # [END bigquery_dataframes_bigquery_methods_struct] + + # [START bigquery_dataframes_bigquery_methods_scalar] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + + # The sql_scalar function can be used to inject SQL syntax that is not supported + # or difficult to express with the bigframes.pandas APIs. + bq_df = bpd.read_gbq(query_or_table) + shortest = bbq.sql_scalar( + "LEAST({0}, {1}, {2})", + columns=[ + bq_df["culmen_depth_mm"], + bq_df["culmen_length_mm"], + bq_df["flipper_length_mm"], + ], + ) + + shortest.peek() + # 0 + # 149 18.9 + # 33 16.3 + # 296 17.2 + # 287 17.0 + # 307 15.0 + # dtype: Float64 + # [END bigquery_dataframes_bigquery_methods_scalar] + assert bq_df is not None + assert lengths is not None + assert shortest is not None diff --git a/samples/snippets/limit_single_timeseries_forecasting_model_test.py b/samples/snippets/limit_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..6a9f14e383 --- /dev/null +++ b/samples/snippets/limit_single_timeseries_forecasting_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +def test_limit_single_timeseries(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_limit_forecast_visualize] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"]).count() + + num_trips.plot.line() + # [END bigquery_dataframes_bqml_limit_forecast_visualize] + + # [START bigquery_dataframes_bqml_limit_forecast_create] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "start_station_id": df["start_station_id"], + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date", "start_station_id"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips[["date"]] + y = num_trips[["num_trips"]] + id_col = num_trips[["start_station_id"]] + + model.fit(X, y, id_col=id_col) + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_limit_forecast_create] + assert df is not None + assert features is not None + assert num_trips is not None diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index 4386177a5e..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,8 +75,15 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8b7ad892c0..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 29234bc4ef..d40d0e0eef 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -251,6 +251,11 @@ def table_id_unique(dataset_id: str): return f"{dataset_id}.{prefixer.create_prefix()}" +@pytest.fixture(scope="function") +def routine_id_unique(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. 
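Editor's note: the `routine_id_unique` fixture added to `tests/system/conftest.py` above generates throwaway routine IDs for the new ARRAY-output `read_gbq_function` tests later in this patch. A minimal sketch of the user-facing flow those tests exercise follows; the project, dataset, and routine names are hypothetical, and the UDF is assumed to already exist in BigQuery:

    # Hypothetical example: wrap an existing SQL UDF, e.g. one created with
    #   CREATE OR REPLACE FUNCTION `my_project.my_dataset.repeat_twice`(x STRING)
    #   RETURNS ARRAY<STRING> AS ([x, x]);
    import bigframes.pandas as bpd

    # Import the BigQuery routine as a callable BigFrames function.
    func = bpd.read_gbq_function("my_project.my_dataset.repeat_twice")

    # Element-wise application yields an array-of-string series.
    s = bpd.Series(["alpha", "beta"])
    result = s.apply(func)
    print(result.to_pandas())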
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 54ba0549a0..7363e370bb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2193,6 +2193,10 @@ def foo(x, y, z): ) ) ) + assert ( + getattr(foo, "bigframes_bigquery_function_output_dtype") + == bigframes.dtypes.STRING_DTYPE + ) # Fails to apply on dataframe with incompatible number of columns with pytest.raises( diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index effadd3b22..8ecb36ecc9 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -12,22 +12,57 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd + import bigframes import bigframes.pandas as bpd -def test_blob_create_from_uri_str(): +def test_blob_create_from_uri_str(bq_connection: str, session: bigframes.Session): bigframes.options.experiments.blob = True - uri_series = bpd.Series( - [ - "gs://bigframes_blob_test/images/img0.jpg", - "gs://bigframes_blob_test/images/img1.jpg", - ] + uris = [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ] + + uri_series = bpd.Series(uris, session=session) + blob_series = uri_series.str.to_blob(connection=bq_connection) + + pd_blob_df = blob_series.struct.explode().to_pandas() + expected_pd_df = pd.DataFrame( + { + "uri": uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + + pd.testing.assert_frame_equal( + pd_blob_df, expected_pd_df, check_dtype=False, check_index_type=False ) - # TODO: use bq_connection fixture when MMD location capitalization fix is in prod - blob_series = uri_series.str.to_blob(connection="us.bigframes-default-connection") - pd_blob_series = blob_series.to_pandas() - assert len(pd_blob_series) == 2 +def test_blob_create_from_glob_path(bq_connection: str, session: bigframes.Session): + bigframes.options.experiments.blob = True + + blob_df = session.from_glob_path( + "gs://bigframes_blob_test/images/*", connection=bq_connection, name="blob_col" + ) + pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() + expected_df = pd.DataFrame( + { + "uri": [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ], + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + + pd.testing.assert_frame_equal( + pd_blob_df, expected_df, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0dc8960f62..99a017c917 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -14,6 +14,7 @@ import inspect import re +import textwrap import google.api_core.exceptions from google.cloud import bigquery @@ -27,6 +28,7 @@ import bigframes.exceptions from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff +import bigframes.session._io.bigquery from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -632,7 +634,6 @@ def add_one(x): )(add_one) -@pytest.mark.flaky(retries=2, delay=120) def 
test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: @@ -705,21 +706,133 @@ def square1(x): assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf_4_params(session): func = session.read_gbq_function("bqutil.fn.cw_instr4") got = func("TestStr123456Str", "Str", 1, 2) assert got == 14 -@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_unique): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING) + RETURNS ARRAY<STRING> + AS ( + [x, x] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello") + assert got == ["hello", "hello"] + + # Test on a series, assert pandas parity + pd_s = pd.Series(["alpha", "beta", "gamma"]) + bf_s = session.read_pandas(pd_s) + pd_result = pd_s.apply(func) + bf_result = bf_s.apply(func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_2_params_array_output( + session, routine_id_unique ): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y STRING) + RETURNS ARRAY<STRING> + AS ( + [x, y] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", "world") + assert got == ["hello", "world"] + + # Test on series, assert pandas parity + pd_df = pd.DataFrame( + {"col0": ["alpha", "beta", "gamma"], "col1": ["delta", "theta", "phi"]} + ) + bf_df = session.read_pandas(pd_df) + pd_result = pd_df["col0"].combine(pd_df["col1"], func) + bf_result = bf_df["col0"].combine(bf_df["col1"], func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_4_params_array_output( + session, routine_id_unique ): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y BOOL, z INT64, w FLOAT64) + RETURNS ARRAY<STRING> + AS ( + [x, CAST(y AS STRING), CAST(z AS STRING), CAST(w AS STRING)] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", True, 1, 2.3) + assert got == ["hello", "true", "1", "2.3"] + + # Test on a dataframe, assert pandas parity + pd_df = pd.DataFrame( + { + "col0": ["alpha", "beta", "gamma"], + "col1": [True, False, True], + "col2": [1, 2, 3], + "col3": [4.5, 6, 7.75], + } + ) + bf_df = session.read_pandas(pd_df) + # Simulate the result directly, since the function cannot be applied + # directly on a pandas dataframe
with axis=1, as this is a special type of + # function with multiple params supported only on bigframes dataframe. + pd_result = pd.Series( + [ + ["alpha", "true", "1", "4.5"], + ["beta", "false", "2", "6"], + ["gamma", "true", "3", "7.75"], + ] + ) + bf_result = bf_df.apply(func, axis=1) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( @@ -754,6 +867,10 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): assert square.bigframes_remote_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE + assert ( + square.bigframes_bigquery_function_output_dtype + == bigframes.dtypes.INT_DTYPE + ) src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} @@ -772,7 +889,6 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_requires_explicit_types( session, bigquery_client, dataset_id ): @@ -863,7 +979,6 @@ def test_read_gbq_function_requires_explicit_types( ), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_respects_python_output_type( request, session_fixture, bigquery_client, dataset_id, array_type, expected_data ): @@ -906,7 +1021,6 @@ def test_read_gbq_function_respects_python_output_type( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( session, bigquery_client, dataset_id, array_type ): @@ -945,7 +1059,6 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supported_python_output_type( session, bigquery_client, dataset_id, array_type ): @@ -992,7 +1105,6 @@ def test_df_apply_scalar_func(session, scalars_dfs): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): with pytest.raises(ValueError) as context: # The remote function has two args, which cannot be row processed. 
Throw @@ -1214,20 +1326,19 @@ def should_mask(name: str) -> bool: repr(s.mask(should_mask, "REDACTED")) -@pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): - gbq_function = f"{dataset_id}.should_mask" - +def test_read_gbq_function_application_repr( + session, routine_id_unique, scalars_df_index +): # This function deliberately has a param with name "name", this is to test # a specific ibis' internal handling of object names session.bqclient.query_and_wait( - f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + f"CREATE OR REPLACE FUNCTION `{routine_id_unique}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" ) - routine = session.bqclient.get_routine(gbq_function) + routine = session.bqclient.get_routine(routine_id_unique) assert "name" in [arg.name for arg in routine.arguments] # read the function and apply to dataframe - should_mask = session.read_gbq_function(gbq_function) + should_mask = session.read_gbq_function(routine_id_unique) s = scalars_df_index["string_col"] diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index 5951d0b12c..b27009d9d8 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -107,3 +107,58 @@ def test_geo_from_xy(): check_series_type=False, check_index=False, ) + + +def test_geo_from_wkt(): + wkts = [ + "Point(0 1)", + "Point(2 4)", + "Point(5 3)", + "Point(6 8)", + ] + + bf_result = bigframes.geopandas.GeoSeries.from_wkt(wkts).to_pandas() + + pd_result = geopandas.GeoSeries.from_wkt(wkts) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_series_type=False, + check_index=False, + ) + + +def test_geo_to_wkt(): + bf_geo = bigframes.geopandas.GeoSeries( + [ + Point(0, 1), + Point(2, 4), + Point(5, 3), + Point(6, 8), + ] + ) + + pd_geo = geopandas.GeoSeries( + [ + Point(0, 1), + Point(2, 4), + Point(5, 3), + Point(6, 8), + ] + ) + + # This test was failing before str.replace was used because pd_result had + # extra whitespace ("POINT (0 1)") while bf_result had none ("POINT(0 1)"). + # str.replace strips all whitespace so the two results compare equal.
+ bf_result = ( + bf_geo.to_wkt().astype("string[pyarrow]").to_pandas().str.replace(" ", "") + ) + + pd_result = pd_geo.to_wkt().astype("string[pyarrow]").str.replace(" ", "") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_index=False, + ) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = 
pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 936becff76..ca83604dd5 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -448,3 +448,15 @@ def test_timestamp_diff_literal_sub_series(scalars_dfs, column, value): expected_result = value - pd_series assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize("column", ["timestamp_col", "datetime_col"]) +def test_timestamp_series_diff_agg(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = bf_series.diff().to_pandas() + + expected_result = pd_series.diff() + assert_series_equal(actual_result, expected_result) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index f84ee811a3..b07213f943 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -509,7 +509,7 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index): df_out = df_out.sort_values("rowindex_2").reset_index(drop=True) utils.convert_pandas_dtypes(df_out, bytes_col=False) - # pd.read_gbq interpets bytes_col as object, reconvert to pyarrow binary + # pd.read_gbq interprets bytes_col as object, reconvert to pyarrow binary df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary())) expected = scalars_pandas_df.copy() expected.index.name = index_col diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) diff --git a/tests/unit/functions/test_remote_function.py 
b/tests/unit/functions/test_remote_function.py index 413a694680..d377fb4d49 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -66,6 +66,12 @@ def test_supported_types_correspond(): ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + # TODO(b/284515241): ARRAY is the only exception because it is supported + # as an output type of the BQ routine in the read_gbq_function path but + # not in the remote function path. Remove this handling once BQ remote + # functions support ARRAY output and the bigframes remote functions + # utilize that to support array output. + if tk != "ARRAY" } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License.
import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index b8a7af437b..b7040d4321 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -100,7 +100,7 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: In case of geographic coordinates, it is assumed that longitude is captured by x coordinates and latitude by y. - **Examples:** + **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.geopandas @@ -122,7 +122,7 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: index (array-like or Index, optional): The index for the GeoSeries. If not given and all coordinate - inputs are Series with an equal index, that index is used.. + inputs are Series with an equal index, that index is used. **kwargs: Additional arguments passed to the Series constructor, e.g. `name`. @@ -132,3 +132,70 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: A GeoSeries of Point geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: + """ + Alternate constructor to create a GeoSeries from a list or array of + WKT objects. + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None + + >>> wkts = [ + ... 'POINT (1 1)', + ... 'POINT (2 2)', + ... 'POINT (3 3)', + ... ] + >>> s = bigframes.geopandas.GeoSeries.from_wkt(wkts) + >>> s + 0 POINT (1 1) + 1 POINT (2 2) + 2 POINT (3 3) + dtype: geometry + + Args: + data (array-like): + Series, list, or array of WKT objects. + + index (array-like or Index, optional): + The index for the GeoSeries. + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of geometries. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_wkt(self) -> bigframes.series.Series: + """ + Convert GeoSeries geometries to WKT + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.geopandas + >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None + + >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) + >>> s + 0 POINT (1 1) + 1 POINT (2 2) + 2 POINT (3 3) + dtype: geometry + + >>> s.to_wkt() + 0 POINT(1 1) + 1 POINT(2 2) + 2 POINT(3 3) + dtype: string + + Returns: + bigframes.series.Series: + WKT representations of the geometries. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/relations.py b/third_party/bigframes_vendored/ibis/expr/operations/relations.py index fccd4cbf58..ef45fdfc0d 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/relations.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/relations.py @@ -10,11 +10,7 @@ from typing import Annotated, Any, Literal, Optional, TypeVar from bigframes_vendored.ibis.common.annotations import attribute -from bigframes_vendored.ibis.common.collections import ( - ConflictingValuesError, - FrozenDict, - FrozenOrderedDict, -) +from bigframes_vendored.ibis.common.collections import FrozenDict, FrozenOrderedDict from bigframes_vendored.ibis.common.exceptions import ( IbisTypeError, IntegrityError, @@ -342,20 +338,6 @@ class Set(Relation): values = FrozenOrderedDict() def __init__(self, left, right, **kwargs): - err_msg = "Table schemas must be equal for set operations." - try: - missing_from_left = right.schema - left.schema - missing_from_right = left.schema - right.schema - except ConflictingValuesError as e: - raise RelationError(err_msg + "\n" + str(e)) from e - if missing_from_left or missing_from_right: - msgs = [err_msg] - if missing_from_left: - msgs.append(f"Columns missing from the left:\n{missing_from_left}.") - if missing_from_right: - msgs.append(f"Columns missing from the right:\n{missing_from_right}.") - raise RelationError("\n".join(msgs)) - if left.schema.names != right.schema.names: # rewrite so that both sides have the columns in the same order making it # easier for the backends to implement set operations @@ -365,7 +347,15 @@ def __init__(self, left, right, **kwargs): @attribute def schema(self): - return self.left.schema + dtypes = ( + dt.higher_precedence(ltype, rtype) + for ltype, rtype in zip( + self.left.schema.values(), self.right.schema.values() + ) + ) + return Schema.from_tuples( + (name, coltype) for name, coltype in zip(self.left.schema.names, dtypes) + ) @public diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f5aa23d00b..e296dcb9f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -7179,7 +7179,7 @@ def __len__(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self): + def __array__(self, dtype=None, copy: Optional[bool] = None): """ Returns the rows as NumPy array. @@ -7210,6 +7210,8 @@ def __array__(self): dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 57f7dfbb79..5e6f546d09 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5941,7 +5941,7 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: """ Returns the values as NumPy array. 
@@ -5965,6 +5965,8 @@ def __array__(self, dtype=None) -> numpy.ndarray: dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data. False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index 6af33f7569..5c9ca1e9c3 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -33,13 +33,17 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = ( - part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") - .merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - .merge( + part.merge( lineitem, - left_on=["P_PARTKEY", "PS_SUPPKEY"], - right_on=["L_PARTKEY", "L_SUPPKEY"], + left_on="P_PARTKEY", + right_on="L_PARTKEY", + ) + .merge( + partsupp, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) + .merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") .merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") .merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") ) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 27dfb23603..762deda9ff 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0"
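Editor's note: the vendored `auc` change above replaces the abstract stub with a concrete trapezoidal-rule computation, negating the area when `x` is monotonically decreasing and preferring `np.trapezoid` (new in NumPy 2.0) over the deprecated `np.trapz`. A small sketch of the same arithmetic on invented inputs:

    import numpy as np
    import pandas as pd

    x = pd.Series([0.0, 0.5, 1.0])  # monotonically increasing, so direction d = 1
    y = pd.Series([0.0, 0.8, 1.0])

    # Prefer the NumPy 2.0 name; fall back to the deprecated spelling.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    # Two trapezoids: 0.5 * (0.0 + 0.8) * 0.5 + 0.5 * (0.8 + 1.0) * 0.5 = 0.65
    print(1 * trapezoid(y, x))  # 0.65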