From 3a4a9de1f797557dfbbb7317eb3754e16131ddbf Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 11 Sep 2024 13:35:17 -0700 Subject: [PATCH 01/18] refactor: Simplify join node definition (#966) --- bigframes/core/__init__.py | 149 ++---------- bigframes/core/blocks.py | 311 ++++++++++++------------ bigframes/core/compile/compiled.py | 2 +- bigframes/core/compile/compiler.py | 6 +- bigframes/core/compile/single_column.py | 35 ++- bigframes/core/guid.py | 2 +- bigframes/core/identifiers.py | 26 ++ bigframes/core/join_def.py | 39 +-- bigframes/core/nodes.py | 30 +-- bigframes/core/schema.py | 3 + 10 files changed, 257 insertions(+), 346 deletions(-) create mode 100644 bigframes/core/identifiers.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index f65509e5b7..b4074dd94f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -17,9 +17,8 @@ import datetime import functools import io -import itertools import typing -from typing import Iterable, Optional, Sequence +from typing import Iterable, Optional, Sequence, Tuple import warnings import google.cloud.bigquery @@ -191,19 +190,14 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) - def project_to_id(self, expression: ex.Expression, output_id: str): + def compute_values(self, assignments: Sequence[Tuple[ex.Expression, str]]): return ArrayValue( - nodes.ProjectionNode( - child=self.node, - assignments=( - ( - expression, - output_id, - ), - ), - ) + nodes.ProjectionNode(child=self.node, assignments=tuple(assignments)) ) + def project_to_id(self, expression: ex.Expression, output_id: str): + return self.compute_values(((expression, output_id),)) + def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case exprs = [ @@ -341,124 +335,33 @@ def _reproject_to_table(self) -> ArrayValue: ) ) - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - join_side: typing.Literal["left", "right"] = "left", - ) -> ArrayValue: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. 
- - Returns: - ArrayValue: The unpivoted ArrayValue - """ - # There will be N labels, used to disambiguate which of N source columns produced each output row - explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - labels_array = self._create_unpivot_labels_array( - row_labels, index_col_ids, explode_offsets_id - ) - - # Unpivot creates N output rows for each input row, labels disambiguate these N rows - joined_array = self._cross_join_w_labels(labels_array, join_side) - - # Build the output rows as a case statment that selects between the N input columns - unpivot_exprs = [] - # Supports producing multiple stacked ouput columns for stacking only part of hierarchical index - for col_id, input_ids in unpivot_columns: - # row explode offset used to choose the input column - # we use offset instead of label as labels are not necessarily unique - cases = itertools.chain( - *( - ( - ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.free_var(id_or_null) - if (id_or_null is not None) - else ex.const(None), - ) - for i, id_or_null in enumerate(input_ids) - ) - ) - col_expr = ops.case_when_op.as_expr(*cases) - unpivot_exprs.append((col_expr, col_id)) - - unpivot_col_ids = [id for id, _ in unpivot_columns] - return ArrayValue( - nodes.ProjectionNode( - child=joined_array.node, - assignments=(*unpivot_exprs,), - ) - ).select_columns([*index_col_ids, *unpivot_col_ids, *passthrough_columns]) - - def _cross_join_w_labels( - self, labels_array: ArrayValue, join_side: typing.Literal["left", "right"] - ) -> ArrayValue: - """ - Convert each row in self to N rows, one for each label in labels array. - """ - table_join_side = ( - join_def.JoinSide.LEFT if join_side == "left" else join_def.JoinSide.RIGHT - ) - labels_join_side = table_join_side.inverse() - labels_mappings = tuple( - join_def.JoinColumnMapping(labels_join_side, id, id) - for id in labels_array.schema.names - ) - table_mappings = tuple( - join_def.JoinColumnMapping(table_join_side, id, id) - for id in self.schema.names - ) - join = join_def.JoinDefinition( - conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" - ) - if join_side == "left": - joined_array = self.relational_join(labels_array, join_def=join) - else: - joined_array = labels_array.relational_join(self, join_def=join) - return joined_array - - def _create_unpivot_labels_array( - self, - former_column_labels: typing.Sequence[typing.Hashable], - col_ids: typing.Sequence[str], - offsets_id: str, - ) -> ArrayValue: - """Create an ArrayValue from a list of label tuples.""" - rows = [] - for row_offset in range(len(former_column_labels)): - row_label = former_column_labels[row_offset] - row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = { - col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None) - for i in range(len(col_ids)) - } - row[offsets_id] = row_offset - rows.append(row) - - return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) - def relational_join( self, other: ArrayValue, - join_def: join_def.JoinDefinition, - ) -> ArrayValue: + conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), + type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", + ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: join_node = nodes.JoinNode( left_child=self.node, right_child=other.node, - join=join_def, + conditions=conditions, + type=type, ) - return ArrayValue(join_node) + # Maps input ids to output ids for caller convenience + l_size = len(self.node.schema) + l_mapping = { + lcol: ocol + for lcol, ocol in zip( + self.node.schema.names, join_node.schema.names[:l_size] + ) + } + r_mapping = { + rcol: ocol + for rcol, ocol in zip( + other.node.schema.names, join_node.schema.names[l_size:] + ) + } + return ArrayValue(join_node), (l_mapping, r_mapping) def try_align_as_projection( self, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4db171ec70..42b1a0aeb0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -44,6 +44,7 @@ import bigframes.core.expression as ex import bigframes.core.expression as scalars import bigframes.core.guid as guid +import bigframes.core.identifiers import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering import bigframes.core.schema as bf_schema @@ -1050,7 +1051,6 @@ def aggregate_all_and_stack( operation: typing.Union[agg_ops.UnaryAggregateOp, agg_ops.NullaryAggregateOp], *, axis: int | str = 0, - value_col_id: str = "values", dropna: bool = True, ) -> Block: axis_n = utils.get_axis_number(axis) @@ -1080,15 +1080,18 @@ def aggregate_all_and_stack( # TODO: Allow to promote identity/total_order columns instead for better perf offset_col = guid.generate_guid() expr_with_offsets = self.expr.promote_offsets(offset_col) - stacked_expr = expr_with_offsets.unpivot( - row_labels=self.column_labels.to_list(), - index_col_ids=[guid.generate_guid()], - unpivot_columns=[(value_col_id, tuple(self.value_columns))], + stacked_expr, (_, value_col_ids, passthrough_cols,) = unpivot( + expr_with_offsets, + row_labels=self.column_labels, + unpivot_columns=[tuple(self.value_columns)], passthrough_columns=[*self.index_columns, offset_col], ) + # these corresponed to passthrough_columns provided to unpivot + index_cols = passthrough_cols[:-1] + og_offset_col = passthrough_cols[-1] index_aggregations = [ (ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.free_var(col_id)), col_id) - for col_id in [*self.index_columns] + for col_id in index_cols ] # TODO: may need add NullaryAggregation in main_aggregation # when agg add support for axis=1, needed for agg("size", axis=1) @@ -1096,17 +1099,18 @@ def aggregate_all_and_stack( operation, agg_ops.UnaryAggregateOp ), f"Expected a unary operation, but got {operation}. Please report this error and how you got here to the BigQuery DataFrames team (bit.ly/bigframes-feedback)." 
main_aggregation = ( - ex.UnaryAggregation(operation, ex.free_var(value_col_id)), - value_col_id, + ex.UnaryAggregation(operation, ex.free_var(value_col_ids[0])), + value_col_ids[0], ) + # Drop row identity after aggregating over it result_expr = stacked_expr.aggregate( [*index_aggregations, main_aggregation], - by_column_ids=[offset_col], + by_column_ids=[og_offset_col], dropna=dropna, - ) + ).drop_columns([og_offset_col]) return Block( - result_expr.drop_columns([offset_col]), - self.index_columns, + result_expr, + index_columns=index_cols, column_labels=[None], index_labels=self.index.names, ) @@ -1318,8 +1322,7 @@ def summarize( ], ): """Get a list of stats as a deferred block object.""" - label_col_id = guid.generate_guid() - labels = [stat.name for stat in stats] + labels = pd.Index([stat.name for stat in stats]) aggregations = [ ( ex.UnaryAggregation(stat, ex.free_var(col_id)) @@ -1331,18 +1334,17 @@ def summarize( for col_id in column_ids ] columns = [ - (col_id, tuple(f"{col_id}-{stat.name}" for stat in stats)) - for col_id in column_ids + (tuple(f"{col_id}-{stat.name}" for stat in stats)) for col_id in column_ids ] - expr = self.expr.aggregate(aggregations).unpivot( + expr, (index_cols, _, _) = unpivot( + self.expr.aggregate(aggregations), labels, unpivot_columns=tuple(columns), - index_col_ids=tuple([label_col_id]), ) return Block( expr, column_labels=self._get_labels_for_columns(column_ids), - index_columns=[label_col_id], + index_columns=index_cols, ) def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): @@ -1368,23 +1370,17 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): ] expr = self.expr.aggregate(aggregations) - index_col_ids = [ - guid.generate_guid() for i in range(self.column_labels.nlevels) - ] input_count = len(self.value_columns) unpivot_columns = tuple( - ( - guid.generate_guid(), - tuple(expr.column_ids[input_count * i : input_count * (i + 1)]), - ) + tuple(expr.column_ids[input_count * i : input_count * (i + 1)]) for i in range(input_count) ) labels = self._get_labels_for_columns(self.value_columns) # TODO(b/340896143): fix type error - expr = expr.unpivot( - row_labels=labels, # type: ignore - index_col_ids=index_col_ids, + expr, (index_col_ids, _, _) = unpivot( + expr, + row_labels=labels, unpivot_columns=unpivot_columns, ) @@ -1604,7 +1600,7 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: Block( expr, index_columns=self.index_columns, - column_labels=self.column_labels.insert(0, label), + column_labels=self.column_labels.insert(len(self.column_labels), label), index_labels=self._index_labels, ), result_id, @@ -1722,8 +1718,6 @@ def stack(self, how="left", levels: int = 1): col_labels, row_labels = utils.split_index(self.column_labels, levels=levels) row_labels = row_labels.drop_duplicates() - row_label_tuples = utils.index_as_tuples(row_labels) - if col_labels is None: result_index: pd.Index = pd.Index([None]) result_col_labels: Sequence[Tuple] = list([()]) @@ -1737,26 +1731,24 @@ def stack(self, how="left", levels: int = 1): result_col_labels = utils.index_as_tuples(result_index) # Get matching columns - unpivot_columns: List[Tuple[str, List[str]]] = [] + unpivot_columns: List[Tuple[Optional[str], ...]] = [] for val in result_col_labels: - col_id = guid.generate_guid("unpivot_") - input_columns, dtype = self._create_stack_column(val, row_label_tuples) - unpivot_columns.append((col_id, input_columns)) + input_columns, _ = self._create_stack_column(val, row_labels) + unpivot_columns.append(input_columns) - 
added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)] - unpivot_expr = self._expr.unpivot( - row_labels=row_label_tuples, + unpivot_expr, (added_index_columns, _, passthrough_cols) = unpivot( + self._expr, + row_labels=row_labels, passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, - index_col_ids=added_index_columns, join_side=how, ) new_index_level_names = self.column_labels.names[-levels:] if how == "left": - index_columns = [*self.index_columns, *added_index_columns] + index_columns = [*passthrough_cols, *added_index_columns] index_labels = [*self._index_labels, *new_index_level_names] else: - index_columns = [*added_index_columns, *self.index_columns] + index_columns = [*added_index_columns, *passthrough_cols] index_labels = [*new_index_level_names, *self._index_labels] return Block( @@ -1780,18 +1772,16 @@ def melt( Arguments correspond to pandas.melt arguments. """ # TODO: Implement col_level and ignore_index - unpivot_col_id = guid.generate_guid() - var_col_ids = tuple([guid.generate_guid() for _ in var_names]) - # single unpivot col - unpivot_col = (unpivot_col_id, tuple(value_vars)) - value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + value_labels: pd.Index = pd.Index( + [self.col_id_to_label[col_id] for col_id in value_vars] + ) id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] - unpivot_expr = self._expr.unpivot( + unpivot_expr, (var_col_ids, unpivot_out, passthrough_cols) = unpivot( + self._expr, row_labels=value_labels, passthrough_columns=id_vars, - unpivot_columns=(unpivot_col,), - index_col_ids=var_col_ids, + unpivot_columns=(tuple(value_vars),), # single unpivot col join_side="right", ) @@ -1804,7 +1794,7 @@ def melt( # Need to reorder to get id_vars before var_col and unpivot_col unpivot_expr = unpivot_expr.select_columns( - [*index_cols, *id_vars, *var_col_ids, unpivot_col_id] + [*index_cols, *passthrough_cols, *var_col_ids, *unpivot_out] ) return Block( @@ -1859,6 +1849,7 @@ def transpose( value_vars=block.value_columns, create_offsets_index=False, ) + row_offset = stacked_block.value_columns[0] col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2] col_offset = stacked_block.value_columns[-2] # disambiguator we created earlier cell_values = stacked_block.value_columns[-1] @@ -1867,7 +1858,7 @@ def transpose( [*col_labels, col_offset] ) # col index is now row index result = stacked_block.pivot( - columns=[offsets], + columns=[row_offset], values=[cell_values], columns_unique_values=tuple(range(original_row_count)), ) @@ -1879,12 +1870,10 @@ def transpose( .with_transpose_cache(self) ) - def _create_stack_column( - self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] - ): + def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): dtype = None input_columns: list[Optional[str]] = [] - for uvalue in stack_labels: + for uvalue in utils.index_as_tuples(stack_labels): label_to_match = (*col_label, *uvalue) label_to_match = ( label_to_match[0] if len(label_to_match) == 1 else label_to_match @@ -2013,38 +2002,16 @@ def merge( sort: bool, suffixes: tuple[str, str] = ("_x", "_y"), ) -> Block: - left_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.LEFT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in self.expr.column_ids - ] - right_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.RIGHT, - source_id=id, - destination_id=guid.generate_guid(), - ) - 
for id in other.expr.column_ids - ] - - join_def = join_defs.JoinDefinition( - conditions=tuple( - join_defs.JoinCondition(left, right) - for left, right in zip(left_join_ids, right_join_ids) - ), - mappings=(*left_mappings, *right_mappings), - type=how, + conditions = tuple( + (lid, rid) for lid, rid in zip(left_join_ids, right_join_ids) + ) + joined_expr, (get_column_left, get_column_right) = self.expr.relational_join( + other.expr, type=how, conditions=conditions ) - joined_expr = self.expr.relational_join(other.expr, join_def=join_def) result_columns = [] matching_join_labels = [] coalesced_ids = [] - get_column_left = join_def.get_left_mapping() - get_column_right = join_def.get_right_mapping() for left_id, right_id in zip(left_join_ids, right_join_ids): coalesced_id = guid.generate_guid() joined_expr = joined_expr.project_to_id( @@ -2748,34 +2715,10 @@ def join_with_single_row( left_expr = left.expr # ignore index columns by dropping them right_expr = single_row_block.expr.select_columns(single_row_block.value_columns) - left_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.LEFT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in left_expr.column_ids - ] - right_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.RIGHT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in right_expr.column_ids # skip index column - ] - - join_def = join_defs.JoinDefinition( - conditions=(), - mappings=(*left_mappings, *right_mappings), - type="cross", - ) - combined_expr = left_expr.relational_join( + combined_expr, (get_column_left, get_column_right) = left_expr.relational_join( right_expr, - join_def=join_def, + type="cross", ) - get_column_left = join_def.get_left_mapping() - get_column_right = join_def.get_right_mapping() # Drop original indices from each side. and used the coalesced combination generated by the join. index_cols_post_join = [get_column_left[id] for id in left.index_columns] @@ -2800,38 +2743,15 @@ def join_mono_indexed( ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left.expr right_expr = right.expr - left_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.LEFT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in left_expr.column_ids - ] - right_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.RIGHT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in right_expr.column_ids - ] - join_def = join_defs.JoinDefinition( + combined_expr, (get_column_left, get_column_right) = left_expr.relational_join( + right_expr, + type=how, conditions=( join_defs.JoinCondition(left.index_columns[0], right.index_columns[0]), ), - mappings=(*left_mappings, *right_mappings), - type=how, ) - combined_expr = left_expr.relational_join( - right_expr, - join_def=join_def, - ) - - get_column_left = join_def.get_left_mapping() - get_column_right = join_def.get_right_mapping() left_index = get_column_left[left.index_columns[0]] right_index = get_column_right[right.index_columns[0]] # Drop original indices from each side. and used the coalesced combination generated by the join. 
@@ -2886,39 +2806,15 @@ def join_multi_indexed( left_expr = left.expr right_expr = right.expr - left_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.LEFT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in left_expr.column_ids - ] - right_mappings = [ - join_defs.JoinColumnMapping( - source_table=join_defs.JoinSide.RIGHT, - source_id=id, - destination_id=guid.generate_guid(), - ) - for id in right_expr.column_ids - ] - - join_def = join_defs.JoinDefinition( + combined_expr, (get_column_left, get_column_right) = left_expr.relational_join( + right_expr, + type=how, conditions=tuple( join_defs.JoinCondition(left, right) for left, right in zip(left_join_ids, right_join_ids) ), - mappings=(*left_mappings, *right_mappings), - type=how, ) - combined_expr = left_expr.relational_join( - right_expr, - join_def=join_def, - ) - - get_column_left = join_def.get_left_mapping() - get_column_right = join_def.get_right_mapping() left_ids_post_join = [get_column_left[id] for id in left_join_ids] right_ids_post_join = [get_column_right[id] for id in right_join_ids] # Drop original indices from each side. and used the coalesced combination generated by the join. @@ -3114,3 +3010,94 @@ def _get_block_schema( for label, dtype in zip(block.column_labels, block.dtypes): result[label] = typing.cast(bigframes.dtypes.Dtype, dtype) return result + + +## Unpivot helpers +def unpivot( + array_value: core.ArrayValue, + row_labels: pd.Index, + unpivot_columns: Sequence[Tuple[Optional[str], ...]], + *, + passthrough_columns: typing.Sequence[str] = (), + join_side: Literal["left", "right"] = "left", +) -> Tuple[core.ArrayValue, Tuple[Tuple[str, ...], Tuple[str, ...], Tuple[str, ...]]]: + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Sequence of column ids tuples. Each tuple of columns will be combined into a single output column + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + + Returns: + ArrayValue, (index_cols, unpivot_cols, passthrough_cols): The unpivoted ArrayValue and resulting column ids. 
+ """ + # There will be N labels, used to disambiguate which of N source columns produced each output row + labels_array = _pd_index_to_array_value( + session=array_value.session, index=row_labels + ) + + # Unpivot creates N output rows for each input row, labels disambiguate these N rows + # Join_side is necessary to produce desired row ordering + if join_side == "left": + joined_array, (column_mapping, labels_mapping) = array_value.relational_join( + labels_array, type="cross" + ) + else: + joined_array, (labels_mapping, column_mapping) = labels_array.relational_join( + array_value, type="cross" + ) + new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] + # Last column is offsets + index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] + explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] + + # Build the output rows as a case statment that selects between the N input columns + unpivot_exprs: List[Tuple[ex.Expression, str]] = [] + # Supports producing multiple stacked ouput columns for stacking only part of hierarchical index + for input_ids in unpivot_columns: + # row explode offset used to choose the input column + # we use offset instead of label as labels are not necessarily unique + cases = itertools.chain( + *( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.free_var(column_mapping[id_or_null]) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) + ) + ) + col_expr = ops.case_when_op.as_expr(*cases) + unpivot_exprs.append((col_expr, guid.generate_guid())) + + unpivot_col_ids = [id for _, id in unpivot_exprs] + + return joined_array.compute_values(unpivot_exprs).select_columns( + [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols] + ), (tuple(index_col_ids), tuple(unpivot_col_ids), tuple(new_passthrough_cols)) + + +def _pd_index_to_array_value( + session: core.Session, + index: pd.Index, +) -> core.ArrayValue: + """ + Create an ArrayValue from a list of label tuples. + The last column will be row offsets. + """ + rows = [] + labels_as_tuples = utils.index_as_tuples(index) + for row_offset in range(len(index)): + id_gen = bigframes.core.identifiers.standard_identifiers() + row_label = labels_as_tuples[row_offset] + row_label = (row_label,) if not isinstance(row_label, tuple) else row_label + row = {} + for label_part, id in zip(row_label, id_gen): + row[id] = label_part if pd.notnull(label_part) else None + row[next(id_gen)] = row_offset + rows.append(row) + + return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 9a9f598e89..38b8fb50e3 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -791,10 +791,10 @@ def promote_offsets(self, col_id: str) -> OrderedIR: if ordering.is_sequential and (ordering.total_order_col is not None): expr_builder = self.builder() expr_builder.columns = [ + *self.columns, self._compile_expression( ordering.total_order_col.scalar_expression ).name(col_id), - *self.columns, ] return expr_builder.build() # Cannot nest analytic expressions, so reproject to cte first if needed. 
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 80d5f5a893..950c2c2cc7 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -81,7 +81,8 @@ def compile_join(self, node: nodes.JoinNode, ordered: bool = True): return bigframes.core.compile.single_column.join_by_column_ordered( left=left_ordered, right=right_ordered, - join=node.join, + type=node.type, + conditions=node.conditions, ) else: left_unordered = self.compile_unordered_ir(node.left_child) @@ -89,7 +90,8 @@ def compile_join(self, node: nodes.JoinNode, ordered: bool = True): return bigframes.core.compile.single_column.join_by_column_unordered( left=left_unordered, right=right_unordered, - join=node.join, + type=node.type, + conditions=node.conditions, ) @_compile_node.register diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index 9b621c9c79..26af969b74 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -16,20 +16,23 @@ from __future__ import annotations +from typing import Literal, Tuple + import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import bigframes.core.compile.compiled as compiled import bigframes.core.guid as guids -import bigframes.core.join_def as join_defs +import bigframes.core.identifiers as ids import bigframes.core.ordering as orderings def join_by_column_ordered( left: compiled.OrderedIR, right: compiled.OrderedIR, - join: join_defs.JoinDefinition, + conditions: Tuple[Tuple[str, str], ...], + type: Literal["inner", "outer", "left", "right", "cross"], ) -> compiled.OrderedIR: """Join two expressions by column equality. @@ -48,6 +51,11 @@ def join_by_column_ordered( finally, all the right columns. """ + # Do not reset the generator + id_generator = ids.standard_identifiers() + l_value_mapping = dict(zip(left.column_ids, id_generator)) + r_value_mapping = dict(zip(right.column_ids, id_generator)) + l_hidden_mapping = { id: guids.generate_guid("hidden_") for id in left._hidden_column_ids } @@ -55,8 +63,8 @@ def join_by_column_ordered( id: guids.generate_guid("hidden_") for id in right._hidden_column_ids } - l_mapping = {**join.get_left_mapping(), **l_hidden_mapping} - r_mapping = {**join.get_right_mapping(), **r_hidden_mapping} + l_mapping = {**l_value_mapping, **l_hidden_mapping} + r_mapping = {**r_value_mapping, **r_hidden_mapping} left_table = left._to_ibis_expr( ordering_mode="unordered", @@ -71,14 +79,14 @@ def join_by_column_ordered( join_conditions = [ value_to_join_key(left_table[l_mapping[left_index]]) == value_to_join_key(right_table[r_mapping[right_index]]) - for left_index, right_index in join.conditions + for left_index, right_index in conditions ] combined_table = ibis.join( left_table, right_table, predicates=join_conditions, - how=join.type, # type: ignore + how=type, # type: ignore ) # Preserve ordering accross joins. 
@@ -87,7 +95,7 @@ def join_by_column_ordered( right._ordering, l_mapping, r_mapping, - left_order_dominates=(join.type != "right"), + left_order_dominates=(type != "right"), ) # We could filter out the original join columns, but predicates/ordering @@ -116,7 +124,8 @@ def join_by_column_ordered( def join_by_column_unordered( left: compiled.UnorderedIR, right: compiled.UnorderedIR, - join: join_defs.JoinDefinition, + conditions: Tuple[Tuple[str, str], ...], + type: Literal["inner", "outer", "left", "right", "cross"], ) -> compiled.UnorderedIR: """Join two expressions by column equality. @@ -134,9 +143,9 @@ def join_by_column_unordered( first the coalesced join keys, then, all the left columns, and finally, all the right columns. """ - # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - l_mapping = join.get_left_mapping() - r_mapping = join.get_right_mapping() + id_generator = ids.standard_identifiers() + l_mapping = dict(zip(left.column_ids, id_generator)) + r_mapping = dict(zip(right.column_ids, id_generator)) left_table = left._to_ibis_expr( col_id_overrides=l_mapping, ) @@ -146,14 +155,14 @@ def join_by_column_unordered( join_conditions = [ value_to_join_key(left_table[l_mapping[left_index]]) == value_to_join_key(right_table[r_mapping[right_index]]) - for left_index, right_index in join.conditions + for left_index, right_index in conditions ] combined_table = ibis.join( left_table, right_table, predicates=join_conditions, - how=join.type, # type: ignore + how=type, # type: ignore ) # We could filter out the original join columns, but predicates/ordering # might still reference them in implicit joins. diff --git a/bigframes/core/guid.py b/bigframes/core/guid.py index 4eb6c7a9d6..8930d0760a 100644 --- a/bigframes/core/guid.py +++ b/bigframes/core/guid.py @@ -18,4 +18,4 @@ def generate_guid(prefix="col_"): global _GUID_COUNTER _GUID_COUNTER += 1 - return prefix + str(_GUID_COUNTER) + return f"bfuid_{prefix}{_GUID_COUNTER}" diff --git a/bigframes/core/identifiers.py b/bigframes/core/identifiers.py new file mode 100644 index 0000000000..9239c41248 --- /dev/null +++ b/bigframes/core/identifiers.py @@ -0,0 +1,26 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Later, plan on migrating ids to use integers to reduce memory usage allow use of bitmaps to represent column sets + +from typing import Generator + +ID_TYPE = str + + +def standard_identifiers() -> Generator[ID_TYPE, None, None]: + i = 0 + while True: + yield f"col_{i}" + i = i + 1 diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 4079abc8fa..5b7b7e45dd 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -15,7 +15,9 @@ import dataclasses import enum -from typing import Literal, Mapping, NamedTuple, Tuple +from typing import Literal, NamedTuple + +import bigframes.core.identifiers as ids class JoinSide(enum.Enum): @@ -32,42 +34,21 @@ def inverse(self) -> JoinSide: class JoinCondition(NamedTuple): - left_id: str - right_id: str + left_id: ids.ID_TYPE + right_id: ids.ID_TYPE @dataclasses.dataclass(frozen=True) class JoinColumnMapping: source_table: JoinSide - source_id: str - destination_id: str + source_id: ids.ID_TYPE + destination_id: ids.ID_TYPE @dataclasses.dataclass(frozen=True) class CoalescedColumnMapping: """Special column mapping used only by implicit joiner only""" - left_source_id: str - right_source_id: str - destination_id: str - - -@dataclasses.dataclass(frozen=True) -class JoinDefinition: - conditions: Tuple[JoinCondition, ...] - mappings: Tuple[JoinColumnMapping, ...] - type: JoinType - - def get_left_mapping(self) -> Mapping[str, str]: - return { - i.source_id: i.destination_id - for i in self.mappings - if i.source_table == JoinSide.LEFT - } - - def get_right_mapping(self) -> Mapping[str, str]: - return { - i.source_id: i.destination_id - for i in self.mappings - if i.source_table == JoinSide.RIGHT - } + left_source_id: ids.ID_TYPE + right_source_id: ids.ID_TYPE + destination_id: ids.ID_TYPE diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 27e76c7910..e90ecd06b6 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -26,7 +26,7 @@ import bigframes.core.expression as ex import bigframes.core.guid -from bigframes.core.join_def import JoinColumnMapping, JoinDefinition, JoinSide +import bigframes.core.identifiers as bfet_ids from bigframes.core.ordering import OrderingExpression import bigframes.core.schema as schemata import bigframes.core.window_spec as window @@ -206,7 +206,8 @@ def order_ambiguous(self) -> bool: class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode - join: JoinDefinition + conditions: typing.Tuple[typing.Tuple[str, str], ...] 
+ type: typing.Literal["inner", "outer", "left", "right", "cross"] @property def row_preserving(self) -> bool: @@ -233,19 +234,14 @@ def __hash__(self): @functools.cached_property def schema(self) -> schemata.ArraySchema: - def join_mapping_to_schema_item(mapping: JoinColumnMapping): - result_id = mapping.destination_id - result_dtype = ( - self.left_child.schema.get_type(mapping.source_id) - if mapping.source_table == JoinSide.LEFT - else self.right_child.schema.get_type(mapping.source_id) - ) - return schemata.SchemaItem(result_id, result_dtype) - - items = tuple( - join_mapping_to_schema_item(mapping) for mapping in self.join.mappings + items = [] + schema_items = itertools.chain( + self.left_child.schema.items, self.right_child.schema.items ) - return schemata.ArraySchema(items) + identifiers = bfet_ids.standard_identifiers() + for id, item in zip(identifiers, schema_items): + items.append(schemata.SchemaItem(id, item.dtype)) + return schemata.ArraySchema(tuple(items)) @functools.cached_property def variables_introduced(self) -> int: @@ -545,7 +541,7 @@ def non_local(self) -> bool: @property def schema(self) -> schemata.ArraySchema: - return self.child.schema.prepend( + return self.child.schema.append( schemata.SchemaItem(self.col_id, bigframes.dtypes.INT_DTYPE) ) @@ -626,6 +622,10 @@ def relation_ops_created(self) -> int: class SelectionNode(UnaryNode): input_output_pairs: typing.Tuple[typing.Tuple[str, str], ...] + def __post_init__(self): + for input, _ in self.input_output_pairs: + assert input in self.child.schema.names + def __hash__(self): return self._node_hash diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index ee27c6ff30..03e4de8993 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -92,3 +92,6 @@ def update_dtype( def get_type(self, id: ColumnIdentifierType): return self._mapping[id] + + def __len__(self) -> int: + return len(self.items) From 40113d8079446452b87b9c7e794a86f2635d1ff2 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 11 Sep 2024 14:54:44 -0700 Subject: [PATCH 02/18] refactor: Switch explode node to use column offsets (#978) --- bigframes/core/__init__.py | 8 +++++--- bigframes/core/compile/compiled.py | 11 ++++++++--- bigframes/core/nodes.py | 9 ++++++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index b4074dd94f..a0563e6edd 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -382,9 +382,8 @@ def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: for column_id in column_ids: assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) - return ArrayValue( - nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids)) - ) + offsets = tuple(self.get_offset_for_name(id) for id in column_ids) + return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) def _uniform_sampling(self, fraction: float) -> ArrayValue: """Sampling the table on given fraction. @@ -393,3 +392,6 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: The row numbers of result is non-deterministic, avoid to use. 
""" return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) + + def get_offset_for_name(self, name: str): + return self.schema.names.index(name) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 38b8fb50e3..2a13ce0da0 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -401,8 +401,9 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) - def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR: table = self._to_ibis_expr() + column_ids = tuple(table.columns[offset] for offset in offsets) # The offset array ensures null represents empty arrays after unnesting. offset_array_id = bigframes.core.guid.generate_guid("offset_array_") @@ -712,8 +713,9 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ordering=self._ordering, ) - def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: + def explode(self, offsets: typing.Sequence[int]) -> OrderedIR: table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + column_ids = tuple(table.columns[offset] for offset in offsets) offset_array_id = bigframes.core.guid.generate_guid("offset_array_") offset_array = ( @@ -721,7 +723,10 @@ def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: ibis.greatest( 0, ibis.least( - *[table[column_id].length() - 1 for column_id in column_ids] + *[ + table[table.columns[offset]].length() - 1 + for offset in offsets + ] ), ) ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index e90ecd06b6..e11fcafff7 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -42,6 +42,9 @@ OVERHEAD_VARIABLES = 5 +COL_OFFSET = int + + @dataclass(frozen=True) class BigFrameNode: """ @@ -826,7 +829,7 @@ def variables_introduced(self) -> int: @dataclass(frozen=True) class ExplodeNode(UnaryNode): - column_ids: typing.Tuple[str, ...] + column_ids: typing.Tuple[COL_OFFSET, ...] @property def row_preserving(self) -> bool: @@ -844,9 +847,9 @@ def schema(self) -> schemata.ArraySchema: self.child.schema.get_type(name).pyarrow_dtype.value_type ), ) - if name in self.column_ids + if offset in self.column_ids else schemata.SchemaItem(name, self.child.schema.get_type(name)) - for name in self.child.schema.names + for offset, name in enumerate(self.child.schema.names) ) return schemata.ArraySchema(items) From c94ead996e3bfa98edd51ff678a3d43a10ee980f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 12 Sep 2024 09:42:47 -0700 Subject: [PATCH 03/18] chore: apply `remote_function` on the original series without reprojecting (#874) * chore: apply `remote_function` on the original series This change tests application of remote function without reprojecting the original series. 
* add failing mask doctest as a system test for easier debugging * more comprehensive repr tests * more tests, move to small tests * rename "name" param * manipulate copy of the original udf * move the funciton copy after i/o types resolution * rename all params to avoid collisions, widely use bigframes_ prefix for consistency --- .../functions/_remote_function_session.py | 18 +++ bigframes/functions/remote_function.py | 11 +- bigframes/series.py | 8 +- tests/system/small/test_remote_function.py | 110 ++++++++++++++++++ 4 files changed, 138 insertions(+), 9 deletions(-) diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py index 893b903aeb..c947fcdc63 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_remote_function_session.py @@ -22,6 +22,7 @@ from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union import warnings +import cloudpickle import google.api_core.exceptions from google.cloud import ( bigquery, @@ -458,6 +459,11 @@ def wrapper(func): session=session, # type: ignore ) + # To respect the user code/environment let's use a copy of the + # original udf, especially since we would be setting some properties + # on it + func = cloudpickle.loads(cloudpickle.dumps(func)) + # In the unlikely case where the user is trying to re-deploy the same # function, cleanup the attributes we add below, first. This prevents # the pickle from having dependencies that might not otherwise be @@ -499,6 +505,18 @@ def try_delattr(attr): cloud_function_memory_mib=cloud_function_memory_mib, ) + # TODO(shobs): Find a better way to support udfs with param named "name". + # This causes an issue in the ibis compilation. + func.__signature__ = inspect.signature(func).replace( # type: ignore + parameters=[ + inspect.Parameter( + f"bigframes_{param.name}", + param.kind, + ) + for param in inspect.signature(func).parameters.values() + ] + ) + # TODO: Move ibis logic to compiler step node = ibis.udf.scalar.builtin( func, diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 39e3bfd8f0..b4c74e90d6 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -144,16 +144,21 @@ def read_gbq_function( # The name "args" conflicts with the Ibis operator, so we use # non-standard names for the arguments here. - def func(*ignored_args, **ignored_kwargs): + def func(*bigframes_args, **bigframes_kwargs): f"""Remote function {str(routine_ref)}.""" nonlocal node # type: ignore - expr = node(*ignored_args, **ignored_kwargs) # type: ignore + expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore return ibis_client.execute(expr) func.__signature__ = inspect.signature(func).replace( # type: ignore parameters=[ - inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) + # TODO(shobs): Find a better way to support functions with param + # named "name". This causes an issue in the ibis compilation. + inspect.Parameter( + f"bigframes_{name}", + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) for name in ibis_signature.parameter_names ] ) diff --git a/bigframes/series.py b/bigframes/series.py index 5192a9cf49..d9e3bb19dd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1481,12 +1481,8 @@ def apply( ex.message += f"\n{_remote_function_recommendation_message}" raise - # We are working with remote function at this point. - # Reproject as workaround to applying filter too late. 
This forces the - # filter to be applied before passing data to remote function, - # protecting from bad inputs causing errors. - reprojected_series = Series(self._block._force_reproject()) - result_series = reprojected_series._apply_unary_op( + # We are working with remote function at this point + result_series = self._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index b000354ed4..5ffda56f92 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import inspect import re import google.api_core.exceptions @@ -972,3 +973,112 @@ def echo_len(row): bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." ): scalars_df[[column]].apply(echo_len_remote, axis=1) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_application_repr(session, dataset_id_permanent): + # This function deliberately has a param with name "name", this is to test + # a specific ibis' internal handling of object names + def should_mask(name: str) -> bool: + hash = 0 + for char_ in name: + hash += ord(char_) + return hash % 2 == 0 + + assert "name" in inspect.signature(should_mask).parameters + + should_mask = session.remote_function( + dataset=dataset_id_permanent, name=get_rf_name(should_mask) + )(should_mask) + + s = bigframes.series.Series(["Alice", "Bob", "Caroline"]) + + repr(s.apply(should_mask)) + repr(s.where(s.apply(should_mask))) + repr(s.where(~s.apply(should_mask))) + repr(s.mask(should_mask)) + repr(s.mask(should_mask, "REDACTED")) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): + gbq_function = f"{dataset_id}.should_mask" + + # This function deliberately has a param with name "name", this is to test + # a specific ibis' internal handling of object names + session.bqclient.query_and_wait( + f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + ) + routine = session.bqclient.get_routine(gbq_function) + assert "name" in [arg.name for arg in routine.arguments] + + # read the function and apply to dataframe + should_mask = session.read_gbq_function(gbq_function) + + s = scalars_df_index["string_col"] + + repr(s.apply(should_mask)) + repr(s.where(s.apply(should_mask))) + repr(s.where(~s.apply(should_mask))) + repr(s.mask(should_mask)) + repr(s.mask(should_mask, "REDACTED")) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_apply_after_filter(session, dataset_id_permanent, scalars_dfs): + + # This function is deliberately written to not work with NA input + def plus_one(x: int) -> int: + return x + 1 + + scalars_df, scalars_pandas_df = scalars_dfs + int_col_name_with_nulls = "int64_col" + + # make sure there are NA values in the test column + assert any([pd.isna(val) for val in scalars_df[int_col_name_with_nulls]]) + + # create a remote function + plus_one_remote = session.remote_function( + dataset=dataset_id_permanent, name=get_rf_name(plus_one) + )(plus_one) + + # with nulls in the series the remote function application would fail + with pytest.raises( + google.api_core.exceptions.BadRequest, match="unsupported operand" + ): + scalars_df[int_col_name_with_nulls].apply(plus_one_remote).to_pandas() + + # after filtering out nulls the 
remote function application should works + # similar to pandas + pd_result = scalars_pandas_df[scalars_pandas_df[int_col_name_with_nulls].notnull()][ + int_col_name_with_nulls + ].apply(plus_one) + bf_result = ( + scalars_df[scalars_df[int_col_name_with_nulls].notnull()][ + int_col_name_with_nulls + ] + .apply(plus_one_remote) + .to_pandas() + ) + + # ignore pandas "int64" vs bigframes "Int64" dtype difference + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_apply_assign_partial_ordering_mode(dataset_id_permanent): + session = bigframes.Session(bigframes.BigQueryOptions(ordering_mode="partial")) + + df = session.read_gbq("bigquery-public-data.baseball.schedules")[ + ["duration_minutes"] + ] + + def plus_one(x: int) -> int: + return x + 1 + + plus_one = session.remote_function( + dataset=dataset_id_permanent, name=get_rf_name(plus_one) + )(plus_one) + + df1 = df.assign(duration_cat=df["duration_minutes"].apply(plus_one)) + repr(df1) From 42b0724fc5859614a0771bcee6cb972056543436 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 12 Sep 2024 18:41:48 -0700 Subject: [PATCH 04/18] test: disable claude3 predict tests to protect bqml capacity (#983) --- tests/system/small/ml/test_llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index e3d2b51081..cd5d65c458 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -439,6 +439,7 @@ def test_claude3_text_generator_create_load( assert reloaded_model.model_name == model_name +@pytest.mark.skip("b/366290533 too many requests are exhausting bqml capacity") @pytest.mark.parametrize( "model_name", ("claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", "claude-3-opus"), @@ -458,6 +459,7 @@ def test_claude3_text_generator_predict_default_params_success( ) +@pytest.mark.skip("b/366290533 too many requests are exhausting bqml capacity") @pytest.mark.parametrize( "model_name", ("claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", "claude-3-opus"), From 30e11d901e72faf02c6d594eaa03c4eb24e69b2a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 13 Sep 2024 10:58:55 -0700 Subject: [PATCH 05/18] refactor: Limit access to ArrayValue node field. 
(#977) --- bigframes/core/__init__.py | 15 +++++++++++++++ bigframes/core/blocks.py | 14 +++++--------- bigframes/core/validations.py | 2 +- bigframes/session/executor.py | 4 ++-- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index a0563e6edd..aa3aa63cc2 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -36,6 +36,7 @@ import bigframes.core.ordering as orderings import bigframes.core.rewrite import bigframes.core.schema as schemata +import bigframes.core.tree_properties import bigframes.core.utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes @@ -124,6 +125,20 @@ def schema(self) -> schemata.ArraySchema: def _compiled_schema(self) -> schemata.ArraySchema: return bigframes.core.compile.test_only_ibis_inferred_schema(self.node) + @property + def explicitly_ordered(self) -> bool: + # see BigFrameNode.explicitly_ordered + return self.node.explicitly_ordered + + @property + def order_ambiguous(self) -> bool: + # see BigFrameNode.order_ambiguous + return self.node.order_ambiguous + + @property + def supports_fast_peek(self) -> bool: + return bigframes.core.tree_properties.can_fast_peek(self.node) + def as_cached( self: ArrayValue, cache_table: google.cloud.bigquery.Table, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 42b1a0aeb0..feeed78885 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -49,7 +49,6 @@ import bigframes.core.ordering as ordering import bigframes.core.schema as bf_schema import bigframes.core.sql as sql -import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.core.window_spec as window_specs import bigframes.dtypes @@ -205,7 +204,7 @@ def shape(self) -> typing.Tuple[int, int]: row_count_expr = self.expr.row_count() # Support in-memory engines for hermetic unit tests. 
- if self.expr.node.session is None: + if self.expr.session is None: try: row_count = row_count_expr._try_evaluate_local().squeeze() return (row_count, len(self.value_columns)) @@ -283,7 +282,7 @@ def index_name_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: @property def explicitly_ordered(self) -> bool: - return self.expr.node.explicitly_ordered + return self.expr.explicitly_ordered def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: """ @@ -466,7 +465,7 @@ def _validate_result_schema( ): actual_schema = tuple(bq_result_schema) ibis_schema = self.expr._compiled_schema - internal_schema = self.expr.node.schema + internal_schema = self.expr.schema if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: return if internal_schema.to_bigquery() != actual_schema: @@ -561,7 +560,7 @@ def to_pandas( def try_peek( self, n: int = 20, force: bool = False ) -> typing.Optional[pd.DataFrame]: - if force or tree_properties.can_fast_peek(self.expr.node): + if force or self.expr.supports_fast_peek: iterator, _ = self.session._peek(self.expr, n) df = self._to_dataframe(iterator) self._copy_index_to_pandas(df) @@ -2365,10 +2364,7 @@ def cached(self, *, force: bool = False, session_aware: bool = False) -> None: if (not force) and self.session._executor._is_trivially_executable(self.expr): return elif session_aware: - bfet_roots = [obj._block._expr.node for obj in self.session.objects] - self.session._executor._cache_with_session_awareness( - self.expr, session_forest=bfet_roots - ) + self.session._executor._cache_with_session_awareness(self.expr) else: self.session._executor._cache_with_cluster_cols( self.expr, cluster_cols=self.index_columns diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index daa1252824..ca65445dab 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -66,7 +66,7 @@ def enforce_ordered( object: HasSession, opname: str, suggestion: Optional[str] = None ) -> None: session = object._session - if session._strictly_ordered or not object._block.expr.node.order_ambiguous: + if session._strictly_ordered or not object._block.expr.order_ambiguous: # No ambiguity for how to calculate ordering, so no error or warning return None if not session._allows_ambiguity: diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 424e6d7dad..0512fa78e3 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -15,7 +15,7 @@ from __future__ import annotations import math -from typing import cast, Iterable, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings import weakref @@ -381,8 +381,8 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): def _cache_with_session_awareness( self, array_value: bigframes.core.ArrayValue, - session_forest: Iterable[nodes.BigFrameNode], ) -> None: + session_forest = [obj._block._expr.node for obj in array_value.session.objects] # These node types are cheap to re-compute target, cluster_cols = bigframes.session.planner.session_aware_cache_plan( array_value.node, list(session_forest) From 0f047b4fae2a10b2a465c506bea561f8bb8d4262 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 13 Sep 2024 13:03:34 -0700 Subject: [PATCH 06/18] fix: DataFrameGroupby.agg now works with unnamed tuples (#985) --- bigframes/core/groupby/__init__.py | 10 ++++----- tests/system/small/test_groupby.py | 35 
++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2b80d0389e..a0105f4ef0 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -414,12 +414,10 @@ def _agg_named(self, **kwargs) -> df.DataFrame: raise NotImplementedError( f"Only string aggregate names supported. {constants.FEEDBACK_LINK}" ) - if not hasattr(v, "column") or not hasattr(v, "aggfunc"): - import bigframes.pandas as bpd - - raise TypeError(f"kwargs values must be {bpd.NamedAgg.__qualname__}") - col_id = self._resolve_label(v.column) - aggregations.append((col_id, agg_ops.lookup_agg_func(v.aggfunc))) + if not isinstance(v, tuple) or (len(v) != 2): + raise TypeError("kwargs values must be 2-tuples of column, aggfunc") + col_id = self._resolve_label(v[0]) + aggregations.append((col_id, agg_ops.lookup_agg_func(v[1]))) column_labels.append(k) agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 8e3baff4c2..8574860daa 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -247,6 +247,41 @@ def test_dataframe_groupby_agg_named(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_agg_kw_tuples(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names] + .groupby("string_col") + .agg( + agg1=("int64_too", "sum"), + agg2=("float64_col", "max"), + ) + ) + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("string_col") + .agg(agg1=("int64_too", "sum"), agg2=("float64_col", "max")) + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + +@pytest.mark.parametrize( + ("kwargs"), + [ + ({"hello": "world"}), + ({"too_many_fields": ("one", "two", "three")}), + ], +) +def test_dataframe_groupby_agg_kw_error(scalars_df_index, kwargs): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + with pytest.raises( + TypeError, match=r"kwargs values must be 2-tuples of column, aggfunc" + ): + (scalars_df_index[col_names].groupby("string_col").agg(**kwargs)) + + @pytest.mark.parametrize( ("as_index"), [ From 06c3120d5111cf9ecde4decc6af436cc3487ae23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Sep 2024 15:05:23 -0500 Subject: [PATCH 07/18] refactor: remove circular dependencies preventing local doctest runs (#987) With this change I can once again run ``` pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py ``` Note: having multiple `version.py` files should be fine. release-please will update all such files it finds. 
--- bigframes/bigquery/__init__.py | 3 ++- bigframes/constants.py | 8 -------- bigframes/core/block_transforms.py | 5 +++-- bigframes/core/blocks.py | 2 +- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/core/compile/ibis_types.py | 2 +- bigframes/core/compile/scalar_op_compiler.py | 2 +- bigframes/core/groupby/__init__.py | 2 +- bigframes/core/indexers.py | 2 +- bigframes/core/indexes/base.py | 2 +- bigframes/core/reshape/__init__.py | 2 +- bigframes/core/tools/datetimes.py | 2 +- bigframes/core/validations.py | 7 ++++--- bigframes/dataframe.py | 4 ++-- bigframes/dtypes.py | 3 +-- bigframes/formatting_helpers.py | 3 +-- bigframes/functions/_remote_function_client.py | 2 +- bigframes/functions/_remote_function_session.py | 3 ++- bigframes/functions/remote_function.py | 2 +- bigframes/ml/compose.py | 2 +- bigframes/ml/linear_model.py | 2 +- bigframes/ml/llm.py | 3 ++- bigframes/ml/loader.py | 2 +- bigframes/ml/metrics/_metrics.py | 2 +- bigframes/ml/pipeline.py | 2 +- bigframes/ml/sql.py | 3 +-- bigframes/ml/utils.py | 2 +- bigframes/operations/_matplotlib/core.py | 2 +- bigframes/operations/_matplotlib/hist.py | 2 +- bigframes/operations/base.py | 2 +- bigframes/operations/plotting.py | 2 +- bigframes/operations/strings.py | 2 +- bigframes/pandas/__init__.py | 2 +- bigframes/series.py | 2 +- bigframes/session/__init__.py | 2 +- bigframes/session/_io/bigquery/read_gbq_table.py | 4 ++-- bigframes/session/_io/pandas.py | 4 ++-- bigframes/session/loader.py | 6 ++++-- tests/unit/test_constants.py | 9 ++++++--- tests/unit/test_formatting_helpers.py | 5 +++-- third_party/bigframes_vendored/constants.py | 6 ++---- third_party/bigframes_vendored/version.py | 15 +++++++++++++++ 42 files changed, 77 insertions(+), 64 deletions(-) create mode 100644 third_party/bigframes_vendored/version.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 1e8e8d578d..303120b88a 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -23,7 +23,8 @@ import typing from typing import Literal, Optional, Union -import bigframes.constants as constants +import bigframes_vendored.constants as constants + import bigframes.core.groupby as groupby import bigframes.core.sql import bigframes.ml.utils as utils diff --git a/bigframes/constants.py b/bigframes/constants.py index d6fe699713..4d5b6b8eb3 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -19,14 +19,6 @@ import datetime -import bigframes_vendored.constants - -BF_VERSION = bigframes_vendored.constants.BF_VERSION -FEEDBACK_LINK = bigframes_vendored.constants.FEEDBACK_LINK -ABSTRACT_METHOD_ERROR_MESSAGE = ( - bigframes_vendored.constants.ABSTRACT_METHOD_ERROR_MESSAGE -) - DEFAULT_EXPIRATION = datetime.timedelta(days=7) # https://cloud.google.com/bigquery/docs/locations diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index eaee2e2cc0..eaac0dc785 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -17,9 +17,10 @@ import typing from typing import Sequence +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants +import bigframes.constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.expression as ex @@ -117,7 +118,7 @@ def quantile( ) quantile_cols = [] labels = [] - if len(columns) * len(qs) > constants.MAX_COLUMNS: + if len(columns) * len(qs) > bigframes.constants.MAX_COLUMNS: raise NotImplementedError("Too many 
aggregates requested.") for col in columns: for q in qs: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index feeed78885..3e97b1cb5e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -32,13 +32,13 @@ from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings +import bigframes_vendored.constants as constants import google.cloud.bigquery as bigquery import pandas as pd import pyarrow as pa import bigframes._config.sampling_options as sampling_options import bigframes.constants -import bigframes.constants as constants import bigframes.core as core import bigframes.core.compile.googlesql as googlesql import bigframes.core.expression as ex diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 58973b10eb..488acd63db 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -15,13 +15,13 @@ import typing from typing import cast, Optional +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas as pd -import bigframes.constants as constants import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f4ec295d5f..77bfb84425 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -17,6 +17,7 @@ from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union import warnings +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore @@ -29,7 +30,6 @@ import pandas as pd import pyarrow as pa -import bigframes.constants as constants import bigframes.dtypes # Type hints for Ibis data types supported by BigQuery DataFrame diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 9e18b391d6..c0f12865d6 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,6 +17,7 @@ import functools import typing +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.common.exceptions @@ -26,7 +27,6 @@ import numpy as np import pandas as pd -import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.core.expression as ex import bigframes.dtypes diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index a0105f4ef0..eda517964d 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -17,10 +17,10 @@ import typing from typing import Sequence, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd -import bigframes.constants as constants from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.block_transforms as block_ops diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index dae5eada70..06d9c4bbab 
100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -17,10 +17,10 @@ import typing from typing import Tuple, Union +import bigframes_vendored.constants as constants import ibis import pandas as pd -import bigframes.constants as constants import bigframes.core.blocks import bigframes.core.expression as ex import bigframes.core.guid as guid diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0376e37f96..017702b85a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -19,12 +19,12 @@ import typing from typing import Hashable, Optional, Sequence, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index import google.cloud.bigquery as bigquery import numpy as np import pandas -import bigframes.constants as constants import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index a23461bdb9..49ecedcc87 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -16,9 +16,9 @@ import typing from typing import Iterable, Literal, Optional, Union +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 5d8d8c9685..2abb86a2f3 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -16,10 +16,10 @@ from datetime import datetime from typing import Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes import pandas as pd -import bigframes.constants as constants import bigframes.dataframe import bigframes.dtypes import bigframes.operations as ops diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index ca65445dab..701752c9fc 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -19,7 +19,8 @@ import functools from typing import Optional, Protocol, TYPE_CHECKING, Union -import bigframes.constants +import bigframes_vendored.constants as constants + import bigframes.exceptions if TYPE_CHECKING: @@ -72,9 +73,9 @@ def enforce_ordered( if not session._allows_ambiguity: suggestion_substr = suggestion + " " if suggestion else "" raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{constants.FEEDBACK_LINK}" ) if not object._block.explicitly_ordered: raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} requires an ordering. Use .sort_values or .sort_index to provide an ordering. {bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} requires an ordering. Use .sort_values or .sort_index to provide an ordering. 
{constants.FEEDBACK_LINK}" ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2ae6aefe1b..d7bdd5529b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -36,6 +36,7 @@ ) import warnings +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.frame as vendored_pandas_frame import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import google.api_core.exceptions @@ -49,7 +50,6 @@ import bigframes import bigframes._config.display_options as display_options import bigframes.constants -import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops @@ -3106,7 +3106,7 @@ def to_gbq( self._session.bqclient, temp_table_ref, datetime.datetime.now(datetime.timezone.utc) - + constants.DEFAULT_EXPIRATION, + + bigframes.constants.DEFAULT_EXPIRATION, ) if len(labels) != 0: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index bfed783e1e..3cd2507231 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -20,14 +20,13 @@ import typing from typing import Dict, Literal, Union +import bigframes_vendored.constants as constants import geopandas as gpd # type: ignore import google.cloud.bigquery import numpy as np import pandas as pd import pyarrow as pa -import bigframes.constants as constants - # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ pd.BooleanDtype, diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 752aeb7a10..de0ae8cc68 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -19,6 +19,7 @@ import random from typing import Any, Optional, Union +import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import humanize @@ -26,8 +27,6 @@ import IPython.display as display import ipywidgets as widgets -import bigframes.constants as constants - GenericJob = Union[ bigquery.LoadJob, bigquery.ExtractJob, bigquery.QueryJob, bigquery.CopyJob ] diff --git a/bigframes/functions/_remote_function_client.py b/bigframes/functions/_remote_function_client.py index 3698bda28b..75385f11a5 100644 --- a/bigframes/functions/_remote_function_client.py +++ b/bigframes/functions/_remote_function_client.py @@ -25,9 +25,9 @@ import tempfile from typing import cast, Tuple, TYPE_CHECKING +from bigframes_vendored import constants import requests -from bigframes import constants import bigframes.functions.remote_function_template if TYPE_CHECKING: diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py index c947fcdc63..0510980178 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_remote_function_session.py @@ -22,6 +22,7 @@ from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union import warnings +import bigframes_vendored.constants as constants import cloudpickle import google.api_core.exceptions from google.cloud import ( @@ -31,7 +32,7 @@ resourcemanager_v3, ) -from bigframes import clients, constants +from bigframes import clients if TYPE_CHECKING: from bigframes.session import Session diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b4c74e90d6..4dc6c1ad6b 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -24,12 +24,12 @@ if TYPE_CHECKING: from 
bigframes.session import Session +import bigframes_vendored.constants as constants import google.api_core.exceptions import google.api_core.retry from google.cloud import bigquery import google.iam.v1 -import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.remote_function_template diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 4ea63d2e81..3cfa1851f5 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -23,10 +23,10 @@ import typing from typing import cast, Iterable, List, Optional, Set, Tuple, Union +from bigframes_vendored import constants import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, impute, preprocessing, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 0816ef9b24..8fe1d6ec27 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -19,12 +19,12 @@ from typing import Dict, List, Literal, Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.linear_model._base import bigframes_vendored.sklearn.linear_model._logistic from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index a3cd065a55..53a9d40c6e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -19,10 +19,11 @@ from typing import cast, Literal, Optional, Union import warnings +import bigframes_vendored.constants as constants from google.cloud import bigquery import bigframes -from bigframes import clients, constants +from bigframes import clients from bigframes.core import blocks, log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 4e7e808260..de9681660e 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -17,10 +17,10 @@ from types import MappingProxyType from typing import Union +import bigframes_vendored.constants as constants from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.ml import ( cluster, compose, diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index a40c175000..3c2d6514ae 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -19,6 +19,7 @@ import typing from typing import Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression @@ -26,7 +27,6 @@ import pandas as pd import sklearn.metrics as sklearn_metrics # type: ignore -import bigframes.constants as constants from bigframes.ml import utils import bigframes.pandas as bpd diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 4cd60c5836..dc3bd1f3f4 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -20,11 +20,11 @@ from typing import List, Optional, Tuple, Union +import bigframes_vendored.constants 
as constants import bigframes_vendored.sklearn.pipeline from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.core import log_adapter from bigframes.ml import ( base, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index d14627f590..7120a5a5fd 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -18,10 +18,9 @@ from typing import Iterable, Literal, Mapping, Optional, Union +import bigframes_vendored.constants as constants import google.cloud.bigquery -import bigframes.constants as constants - # TODO: Add proper escaping logic from core/compile module class BaseSqlGenerator: diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 75dfb916f6..d754b1d002 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -15,9 +15,9 @@ import typing from typing import Any, Iterable, Literal, Mapping, Optional, Union +import bigframes_vendored.constants as constants from google.cloud import bigquery -import bigframes.constants as constants from bigframes.core import blocks import bigframes.pandas as bpd diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index ff8dd86cff..9e59e09877 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -15,9 +15,9 @@ import abc import typing +import bigframes_vendored.constants as constants import pandas as pd -import bigframes.constants as constants import bigframes.dtypes as dtypes DEFAULT_SAMPLING_N = 1000 diff --git a/bigframes/operations/_matplotlib/hist.py b/bigframes/operations/_matplotlib/hist.py index 720b94d7da..213e2abd77 100644 --- a/bigframes/operations/_matplotlib/hist.py +++ b/bigframes/operations/_matplotlib/hist.py @@ -15,10 +15,10 @@ import itertools from typing import Literal +import bigframes_vendored.constants as constants import numpy as np import pandas as pd -import bigframes.constants as constants import bigframes.operations._matplotlib.core as bfplt diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 2f87045415..68f46baded 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -17,10 +17,10 @@ import typing from typing import List, Sequence +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import pandas as pd -import bigframes.constants as constants import bigframes.core.blocks as blocks import bigframes.core.convert import bigframes.core.expression as ex diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index ff74806993..a45b825354 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -14,9 +14,9 @@ import typing +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.plotting._core as vendordt -import bigframes.constants as constants import bigframes.operations._matplotlib as bfplt diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4af142e0d5..2e40115985 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -17,9 +17,9 @@ import re from typing import cast, Literal, Optional, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr -import bigframes.constants as constants from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops diff --git a/bigframes/pandas/__init__.py 
b/bigframes/pandas/__init__.py index 3809384c95..94ea6becab 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -36,6 +36,7 @@ Union, ) +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat import bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge @@ -53,7 +54,6 @@ ) import bigframes._config as config -import bigframes.constants as constants import bigframes.core.blocks import bigframes.core.expression as ex import bigframes.core.global_session as global_session diff --git a/bigframes/series.py b/bigframes/series.py index d9e3bb19dd..d4cb1d3700 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -24,6 +24,7 @@ import typing from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery import numpy @@ -31,7 +32,6 @@ import pandas.core.dtypes.common import typing_extensions -import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 045483bd53..7d0cfaee5c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -37,6 +37,7 @@ import warnings import weakref +import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery # noqa import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet @@ -58,7 +59,6 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 03b26f9460..7585dd3f45 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -23,12 +23,12 @@ from typing import Dict, Iterable, List, Optional, Sequence, Tuple import warnings +import bigframes_vendored.constants as constants import google.api_core.exceptions import google.cloud.bigquery as bigquery import bigframes import bigframes.clients -import bigframes.constants import bigframes.core.compile import bigframes.core.compile.default_ordering import bigframes.core.sql @@ -241,7 +241,7 @@ def get_index_cols( # test, as it's not possible to subclass enums in Python. See: # https://stackoverflow.com/a/33680021/101923 raise NotImplementedError( - f"Got unexpected index_col {repr(index_col)}. {bigframes.constants.FEEDBACK_LINK}" + f"Got unexpected index_col {repr(index_col)}. 
{constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): index_cols: List[str] = [index_col] diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 789426a6e3..83e30fd900 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -14,6 +14,7 @@ from typing import Dict, Union +import bigframes_vendored.constants as constants import geopandas # type: ignore import pandas import pandas.arrays @@ -21,7 +22,6 @@ import pyarrow.compute # type: ignore import pyarrow.types # type: ignore -import bigframes.constants import bigframes.features @@ -54,7 +54,7 @@ def arrow_to_pandas( if len(dtypes) != arrow_table.num_columns: raise ValueError( f"Number of types {len(dtypes)} doesn't match number of columns " - f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + f"{arrow_table.num_columns}. {constants.FEEDBACK_LINK}" ) serieses = {} diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 924fddce12..ce9874e35f 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -22,6 +22,7 @@ import typing from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union +import bigframes_vendored.constants as constants import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.auth.credentials @@ -36,7 +37,7 @@ import pandas import bigframes.clients -import bigframes.constants as constants +import bigframes.constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile @@ -444,7 +445,8 @@ def _read_bigquery_load_job( # hours of the anonymous dataset. table_expiration = bigquery.Table(table_id) table_expiration.expires = ( - datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + datetime.datetime.now(datetime.timezone.utc) + + bigframes.constants.DEFAULT_EXPIRATION ) self._bqclient.update_table(table_expiration, ["expires"]) diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py index aabc09c388..4e11419077 100644 --- a/tests/unit/test_constants.py +++ b/tests/unit/test_constants.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import bigframes.constants as constants +import bigframes_vendored.constants + +import bigframes.version def test_feedback_link_includes_version(): - assert len(constants.BF_VERSION) > 0 - assert constants.BF_VERSION in constants.FEEDBACK_LINK + version = bigframes.version.__version__ + assert len(version) > 0 + assert version in bigframes_vendored.constants.FEEDBACK_LINK diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index 3c966752c9..d4fe039484 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -14,12 +14,13 @@ import unittest.mock as mock +import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import pytest -import bigframes.constants as constants import bigframes.formatting_helpers as formatting_helpers +import bigframes.version def test_wait_for_query_job_error_includes_feedback_link(): @@ -54,4 +55,4 @@ def test_wait_for_job_error_includes_version(): formatting_helpers.wait_for_job(mock_job) cap_exc.match("Test message 123.") - cap_exc.match(constants.BF_VERSION) + cap_exc.match(bigframes.version.__version__) diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index 91084b38f9..1effdffcbe 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -16,14 +16,12 @@ This module should not depend on any others in the package. """ -import bigframes.version - -BF_VERSION = bigframes.version.__version__ +import bigframes_vendored.version FEEDBACK_LINK = ( "Share your usecase with the BigQuery DataFrames team at the " "https://bit.ly/bigframes-feedback survey." - f"You are currently running BigFrames version {BF_VERSION}" + f"You are currently running BigFrames version {bigframes_vendored.version.__version__}" ) ABSTRACT_METHOD_ERROR_MESSAGE = ( diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py new file mode 100644 index 0000000000..2c0c6e4d3a --- /dev/null +++ b/third_party/bigframes_vendored/version.py @@ -0,0 +1,15 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__version__ = "1.17.0" From 89ea44fb66314b134fc0a10d816c1659978d4182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Sep 2024 17:22:12 -0500 Subject: [PATCH 08/18] deps: update to ibis-framework 9.x and newer sqlglot (#827) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * deps: update to ibis-framework 9.x and newer sqlglot * update sqlglot and ibis * bump minimum pandas * bump pyarrow * fix bfill and ffill * nearly implement describe * remove remaining reference to vendored_ibis_ops.ApproximateMultiQuantile * support ToJsonString * partial support for quantile * fix inmemorytable * fixed Series.explode * nearly fix to_datetime * remove tests I added * patch for python 3.9 support * fix unit tests * fix explode with time type * fix array_agg * fix array_agg for asc order * actually fix array_agg * fix remote function * fix in-memory nullable integer compilation * fix test_df_construct_pandas_default on Python 3.9 * fix ShiftOp windows * fix inf to SQL by treating values as literal in in memory table * fix unit tests for ibis-framework 9.2.0 * fix Python 3.10 unit tests by syncing deps * fixing remote function after merge * fix visit_NonNullLiteral for int types * visit_WindowFunction to fix s.median() method * fix lint * fix s.diff with window * fix mypy * patch visit_And to fix is_monotonic methods * fix mypy and fillna warning * undo window changes for test_series_autocorr * undo fill_null because it was missed at 9.0 version * vendor more of ibis for python 3.9 compatibility * add default arg for nulls_first for python 3.9 support * restore integer conversion * fix window tests: diff, duplicated, shift * fixing ibis parenthesize_inputs bugs and related tests * fixing lint * disable test_query_complexity_error * fix doctest np.int64(0) upgrades * fix doctest np.int64(0) upgrades more * fix groupby diff * addressing system-3.12/doctest issues related to numpy 2.1.1 * fix test_df_apply_axis_1_complex * address compiler errors after merge * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix unit-test compile errors * remove unused ibis codes * fix fillna deprecated warning * add _remove_null_ordering_from_unsupported_window back to fix test_precision_score etc ml tests * fix is_monotonic_decreasing test * fix explode after merge * fix numpy on remote function test * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * ml numpy sql generations --------- Co-authored-by: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Co-authored-by: Chelsea Lin Co-authored-by: Owl Bot --- bigframes/core/block_transforms.py | 7 +- bigframes/core/blocks.py | 15 +- bigframes/core/compile/aggregate_compiler.py | 38 +- bigframes/core/compile/compiled.py | 55 +- bigframes/core/compile/default_ordering.py | 7 +- bigframes/core/compile/scalar_op_compiler.py | 17 +- bigframes/core/compile/single_column.py | 6 +- bigframes/core/groupby/__init__.py | 13 +- bigframes/core/window_spec.py | 1 - bigframes/dataframe.py | 34 +- .../functions/_remote_function_session.py | 3 +- bigframes/functions/remote_function.py | 3 +- bigframes/ml/preprocessing.py | 1 + bigframes/ml/sql.py | 7 +- bigframes/operations/__init__.py | 4 +- bigframes/series.py | 48 +- noxfile.py | 3 +- setup.py | 4 +- testing/constraints-3.11.txt | 3 + testing/constraints-3.12.txt | 3 + 
testing/constraints-3.9.txt | 4 +- tests/system/conftest.py | 4 +- tests/system/large/test_remote_function.py | 6 +- tests/system/small/test_dataframe.py | 3 + tests/system/small/test_ibis.py | 44 - tests/system/small/test_numpy.py | 7 + tests/system/small/test_series.py | 9 +- .../ibis/backends/bigquery/__init__.py | 3 - .../ibis/backends/bigquery/backend.py | 85 +- .../ibis/backends/bigquery/compiler.py | 59 -- .../ibis/backends/bigquery/registry.py | 72 -- .../ibis/backends/sql/compilers/base.py | 22 +- .../sql/compilers/bigquery/__init__.py | 221 ++--- .../ibis/backends/sql/rewrites.py | 779 +++++++++++------- .../ibis/expr/operations/__init__.py | 1 - .../ibis/expr/operations/arrays.py | 18 - .../ibis/expr/operations/json.py | 2 + .../ibis/expr/operations/reductions.py | 13 +- .../bigframes_vendored/pandas/core/frame.py | 14 +- .../bigframes_vendored/pandas/core/series.py | 60 +- .../sklearn/metrics/_classification.py | 4 +- .../sklearn/metrics/_ranking.py | 8 +- .../sklearn/metrics/_regression.py | 4 +- 43 files changed, 813 insertions(+), 901 deletions(-) delete mode 100644 tests/system/small/test_ibis.py delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py delete mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/registry.py delete mode 100644 third_party/bigframes_vendored/ibis/expr/operations/arrays.py diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index eaac0dc785..1990647e0a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -387,10 +387,9 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = windows.rows( - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, - ) + + # Window framing clause is not allowed for analytic function lag. 
+ window_spec = windows.unbound() original_columns = block.value_columns block, shift_columns = block.multi_apply_window_op( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3e97b1cb5e..39cfed588f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -50,7 +50,7 @@ import bigframes.core.schema as bf_schema import bigframes.core.sql as sql import bigframes.core.utils as utils -import bigframes.core.window_spec as window_specs +import bigframes.core.window_spec as windows import bigframes.dtypes import bigframes.exceptions import bigframes.features @@ -900,7 +900,7 @@ def multi_apply_window_op( self, columns: typing.Sequence[str], op: agg_ops.WindowOp, - window_spec: window_specs.WindowSpec, + window_spec: windows.WindowSpec, *, skip_null_groups: bool = False, never_skip_nulls: bool = False, @@ -959,7 +959,7 @@ def apply_window_op( self, column: str, op: agg_ops.WindowOp, - window_spec: window_specs.WindowSpec, + window_spec: windows.WindowSpec, *, result_label: Label = None, skip_null_groups: bool = False, @@ -1475,7 +1475,7 @@ def grouped_head( value_columns: typing.Sequence[str], n: int, ): - window_spec = window_specs.cumulative_rows(grouping_keys=tuple(by_column_ids)) + window_spec = windows.cumulative_rows(grouping_keys=tuple(by_column_ids)) block, result_id = self.apply_window_op( value_columns[0], @@ -2383,10 +2383,7 @@ def _is_monotonic( return self._stats_cache[column_name][op_name] period = 1 - window = window_specs.rows( - preceding=period, - following=None, - ) + window_spec = windows.rows() # any NaN value means not monotonic block, last_notna_id = self.apply_unary_op(column_ids[0], ops.notnull_op) @@ -2402,7 +2399,7 @@ def _is_monotonic( last_result_id = None for column_id in column_ids[::-1]: block, lag_result_id = block.apply_window_op( - column_id, agg_ops.ShiftOp(period), window + column_id, agg_ops.ShiftOp(period), window_spec ) block, strict_monotonic_id = block.apply_binary_op( column_id, lag_result_id, ops.gt_op if increasing else ops.lt_op diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 488acd63db..91a3045efb 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -13,7 +13,7 @@ # limitations under the License. import functools import typing -from typing import cast, Optional +from typing import cast, List, Optional import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops @@ -31,6 +31,17 @@ scalar_compiler = scalar_compilers.scalar_op_compiler +# TODO(swast): We can remove this if ibis adds general approx_quantile +# See: https://github.com/ibis-project/ibis/issues/9541 +@ibis.udf.agg.builtin +def approx_quantiles(expression: float, number) -> List[float]: + """APPROX_QUANTILES + + https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles + """ + return [] # pragma: NO COVER + + def compile_aggregate( aggregate: ex.Aggregation, bindings: typing.Dict[str, ibis_types.Value], @@ -176,15 +187,12 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - # PERCENTILE_CONT has very few allowed windows. For example, "window - # framing clause is not allowed for analytic function percentile_cont". + # APPROX_QUANTILES has very few allowed windows. if window is not None: raise NotImplementedError( f"Approx Quartiles with windowing is not supported. 
{constants.FEEDBACK_LINK}" ) - value = vendored_ibis_ops.ApproximateMultiQuantile( - column, num_bins=4 # type: ignore - ).to_expr()[op.quartile] + value = approx_quantiles(column, 4)[op.quartile] # type: ignore return cast(ibis_types.NumericValue, value) @@ -513,11 +521,15 @@ def _( column: ibis_types.Column, window=None, ) -> ibis_types.BooleanValue: - # BQ will return null for empty column, result would be true in pandas. - result = _is_true(column).all() + # BQ will return null for empty column, result would be false in pandas. + result = _apply_window_if_present(_is_true(column).all(), window) + literal = ibis_types.literal(True) + return cast( ibis_types.BooleanScalar, - _apply_window_if_present(result, window).fillna(ibis_types.literal(True)), + result.fill_null(literal) + if hasattr(result, "fill_null") + else result.fillna(literal), ) @@ -528,10 +540,14 @@ def _( window=None, ) -> ibis_types.BooleanValue: # BQ will return null for empty column, result would be false in pandas. - result = _is_true(column).any() + result = _apply_window_if_present(_is_true(column).any(), window) + literal = ibis_types.literal(False) + return cast( ibis_types.BooleanScalar, - _apply_window_if_present(result, window).fillna(ibis_types.literal(False)), + result.fill_null(literal) + if hasattr(result, "fill_null") + else result.fillna(literal), ) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 2a13ce0da0..cd00c98381 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -19,10 +19,9 @@ import typing from typing import Collection, Literal, Optional, Sequence -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops +import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery import google.cloud.bigquery import ibis -import ibis.backends.bigquery as ibis_bigquery import ibis.backends.bigquery.datatypes import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes @@ -407,18 +406,13 @@ def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR: # The offset array ensures null represents empty arrays after unnesting. offset_array_id = bigframes.core.guid.generate_guid("offset_array_") - offset_array = ( - vendored_ibis_ops.GenerateArray( - ibis.greatest( - 0, - ibis.least( - *[table[column_id].length() - 1 for column_id in column_ids] - ), - ) - ) - .to_expr() - .name(offset_array_id), - ) + offset_array = ibis.range( + 0, + ibis.greatest( + 1, # We always want at least 1 element to fill in NULLs for empty arrays. + ibis.least(*[table[column_id].length() for column_id in column_ids]), + ), + ).name(offset_array_id) table_w_offset_array = table.select( offset_array, *self._column_names, @@ -718,21 +712,13 @@ def explode(self, offsets: typing.Sequence[int]) -> OrderedIR: column_ids = tuple(table.columns[offset] for offset in offsets) offset_array_id = bigframes.core.guid.generate_guid("offset_array_") - offset_array = ( - vendored_ibis_ops.GenerateArray( - ibis.greatest( - 0, - ibis.least( - *[ - table[table.columns[offset]].length() - 1 - for offset in offsets - ] - ), - ) - ) - .to_expr() - .name(offset_array_id), - ) + offset_array = ibis.range( + 0, + ibis.greatest( + 1, # We always want at least 1 element to fill in NULLs for empty arrays. 
+ ibis.least(*[table[column_id].length() for column_id in column_ids]), + ), + ).name(offset_array_id) table_w_offset_array = table.select( offset_array, *self._column_names, @@ -870,7 +856,7 @@ def project_window_op( clauses = [] if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis.NA)) + clauses.append((column.isnull(), ibis.null())) if window_spec.min_periods: if op.skips_nulls: # Most operations do not count NULL values towards min_periods @@ -891,7 +877,7 @@ def project_window_op( clauses.append( ( observation_count < ibis_types.literal(window_spec.min_periods), - ibis.NA, + ibis.null(), ) ) if clauses: @@ -1322,9 +1308,10 @@ def _ibis_window_from_spec( bounds.preceding, bounds.following, how="range" ) if isinstance(bounds, RowsWindowBounds): - window = window.preceding_following( - bounds.preceding, bounds.following, how="rows" - ) + if bounds.preceding is not None or bounds.following is not None: + window = window.preceding_following( + bounds.preceding, bounds.following, how="rows" + ) else: raise ValueError(f"unrecognized window bounds {bounds}") return window diff --git a/bigframes/core/compile/default_ordering.py b/bigframes/core/compile/default_ordering.py index 7d7a41f742..a6b625caca 100644 --- a/bigframes/core/compile/default_ordering.py +++ b/bigframes/core/compile/default_ordering.py @@ -49,7 +49,12 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringVa # Needed for JSON, STRUCT and ARRAY datatypes result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore # Escape backslashes and use backslash as delineator - escaped = cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore + escaped = cast( + ibis_types.StringColumn, + result.fill_null("") if hasattr(result, "fill_null") else result.fillna(""), + ).replace( + "\\", "\\\\" + ) # type: ignore return cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index c0f12865d6..799a408d5b 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -842,7 +842,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: - return vendored_ibis_ops.SafeCastToDatetime(x).to_expr() + return x.try_cast(ibis_dtypes.Timestamp(None)) else: # Numerical inputs. if op.format: @@ -995,8 +995,14 @@ def eq_nulls_match_op( y: ibis_types.Value, ): """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" - left = x.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) - right = y.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) + literal = ibis_types.literal("$NULL_SENTINEL$") + if hasattr(x, "fill_null"): + left = x.cast(ibis_dtypes.str).fill_null(literal) + right = y.cast(ibis_dtypes.str).fill_null(literal) + else: + left = x.cast(ibis_dtypes.str).fillna(literal) + right = y.cast(ibis_dtypes.str).fillna(literal) + return left == right @@ -1379,7 +1385,10 @@ def fillna_op( x: ibis_types.Value, y: ibis_types.Value, ): - return x.fillna(typing.cast(ibis_types.Scalar, y)) + if hasattr(x, "fill_null"): + return x.fill_null(typing.cast(ibis_types.Scalar, y)) + else: + return x.fillna(typing.cast(ibis_types.Scalar, y)) @scalar_op_compiler.register_binary_op(ops.round_op) diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index 26af969b74..325df8e180 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -179,4 +179,8 @@ def value_to_join_key(value: ibis_types.Value): """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" if not value.type().is_string(): value = value.cast(ibis_dtypes.str) - return value.fillna(ibis_types.literal("$NULL_SENTINEL$")) + return ( + value.fill_null(ibis_types.literal("$NULL_SENTINEL$")) + if hasattr(value, "fill_null") + else value.fillna(ibis_types.literal("$NULL_SENTINEL$")) + ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index eda517964d..5cb0e65729 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -255,19 +255,17 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: @validations.requires_ordering() def shift(self, periods=1) -> series.Series: - window = window_specs.rows( + # Window framing clause is not allowed for analytic function lag. + window = window_specs.unbound( grouping_keys=tuple(self._by_col_ids), - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) @validations.requires_ordering() def diff(self, periods=1) -> series.Series: + # Window framing clause is not allowed for analytic function lag. window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.DiffOp(periods), window=window) @@ -685,10 +683,9 @@ def cumcount(self, *args, **kwargs) -> series.Series: @validations.requires_ordering() def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" + # Window framing clause is not allowed for analytic function lag. 
window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) @@ -696,8 +693,6 @@ def shift(self, periods=1) -> series.Series: def diff(self, periods=1) -> series.Series: window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.DiffOp(periods), window=window) diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index f011e2848d..3d80afea5a 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -70,7 +70,6 @@ def rows( Returns: WindowSpec """ - assert (preceding is not None) or (following is not None) bounds = RowsWindowBounds(preceding=preceding, following=following) return WindowSpec( grouping_keys=grouping_keys, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d7bdd5529b..d7738a569d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -65,7 +65,7 @@ import bigframes.core.utils as utils import bigframes.core.validations as validations import bigframes.core.window -import bigframes.core.window_spec as window_spec +import bigframes.core.window_spec as windows import bigframes.dtypes import bigframes.exceptions import bigframes.formatting_helpers as formatter @@ -1958,12 +1958,12 @@ def replace( @validations.requires_ordering() def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = window_spec.rows(preceding=limit, following=0) + window = windows.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) @validations.requires_ordering() def bfill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = window_spec.rows(preceding=0, following=limit) + window = windows.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def isin(self, values) -> DataFrame: @@ -2670,7 +2670,7 @@ def _perform_join_by_index( @validations.requires_ordering() def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_def = window_spec.rows( + window_def = windows.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -2679,7 +2679,7 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window @validations.requires_ordering() def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window = window_spec.cumulative_rows(min_periods=min_periods) + window = windows.cumulative_rows(min_periods=min_periods) return bigframes.core.window.Window( self._block, window, self._block.value_columns ) @@ -2790,7 +2790,7 @@ def cumsum(self): raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.sum_op, - window_spec.cumulative_rows(), + windows.cumulative_rows(), ) @validations.requires_ordering() @@ -2803,38 +2803,32 @@ def cumprod(self) -> DataFrame: raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.product_op, - window_spec.cumulative_rows(), + windows.cumulative_rows(), ) @validations.requires_ordering() def cummin(self) -> DataFrame: return self._apply_window_op( agg_ops.min_op, - window_spec.cumulative_rows(), + windows.cumulative_rows(), ) @validations.requires_ordering() def cummax(self) -> DataFrame: return self._apply_window_op( agg_ops.max_op, - window_spec.cumulative_rows(), + windows.cumulative_rows(), ) @validations.requires_ordering() def shift(self, periods: int = 1) -> DataFrame: - window = window_spec.rows( - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, - ) - return self._apply_window_op(agg_ops.ShiftOp(periods), window) + window_spec = windows.rows() + return self._apply_window_op(agg_ops.ShiftOp(periods), window_spec) @validations.requires_ordering() def diff(self, periods: int = 1) -> DataFrame: - window = window_spec.rows( - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, - ) - return self._apply_window_op(agg_ops.DiffOp(periods), window) + window_spec = windows.rows() + return self._apply_window_op(agg_ops.DiffOp(periods), window_spec) @validations.requires_ordering() def pct_change(self, periods: int = 1) -> DataFrame: @@ -2845,7 +2839,7 @@ def pct_change(self, periods: int = 1) -> DataFrame: def _apply_window_op( self, op: agg_ops.WindowOp, - window_spec: window_spec.WindowSpec, + window_spec: windows.WindowSpec, ): block, result_ids = self._block.multi_apply_window_op( self._block.value_columns, diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py index 0510980178..6bc7a4b079 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_remote_function_session.py @@ -522,7 +522,8 @@ def try_delattr(attr): node = ibis.udf.scalar.builtin( func, name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + catalog=dataset_ref.project, + database=dataset_ref.dataset_id, signature=(ibis_signature.input_types, ibis_signature.output_type), ) func.bigframes_cloud_function = ( diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 4dc6c1ad6b..1f4fdaad7b 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -170,7 +170,8 @@ def func(*bigframes_args, **bigframes_kwargs): node = ibis.udf.scalar.builtin( func, name=routine_ref.routine_id, - schema=f"{routine_ref.project}.{routine_ref.dataset_id}", + 
catalog=routine_ref.project, + database=routine_ref.dataset_id, signature=(ibis_signature.input_types, ibis_signature.output_type), ) func.bigframes_remote_function = str(routine_ref) # type: ignore diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 13d2041ef3..2c327f63f8 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -307,6 +307,7 @@ def _compile_to_sql( for column in columns: min_value = X[column].min() max_value = X[column].max() + bin_size = (max_value - min_value) / self.n_bins array_split_points[column] = [ min_value + i * bin_size for i in range(self.n_bins - 1) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 7120a5a5fd..1cb327f19c 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -123,7 +123,12 @@ def ml_bucketize( name: str, ) -> str: """Encode ML.BUCKETIZE for BQML""" - return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}""" + # Use Python value rather than Numpy value to serialization. + points = [ + point.item() if hasattr(point, "item") else point + for point in array_split_points + ] + return f"""ML.BUCKETIZE({numeric_expr_sql}, {points}, FALSE) AS {name}""" def ml_quantile_bucketize( self, diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 51a962b13b..55b8fa1802 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -892,7 +892,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT # Just parameterless unary ops for now # TODO: Parameter mappings -NUMPY_TO_OP: typing.Final = { +NUMPY_TO_OP: dict[np.ufunc, UnaryOp] = { np.sin: sin_op, np.cos: cos_op, np.tan: tan_op, @@ -917,7 +917,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT } -NUMPY_TO_BINOP: typing.Final = { +NUMPY_TO_BINOP: dict[np.ufunc, BinaryOp] = { np.add: add_op, np.subtract: sub_op, np.multiply: mul_op, diff --git a/bigframes/series.py b/bigframes/series.py index d4cb1d3700..3a75ab9ccc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -45,7 +45,7 @@ import bigframes.core.utils as utils import bigframes.core.validations as validations import bigframes.core.window -import bigframes.core.window_spec +import bigframes.core.window_spec as windows import bigframes.dataframe import bigframes.dtypes import bigframes.formatting_helpers as formatter @@ -465,13 +465,11 @@ def case_when(self, caselist) -> Series: @validations.requires_ordering() def cumsum(self) -> Series: - return self._apply_window_op( - agg_ops.sum_op, bigframes.core.window_spec.cumulative_rows() - ) + return self._apply_window_op(agg_ops.sum_op, windows.cumulative_rows()) @validations.requires_ordering() def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.rows(preceding=limit, following=0) + window = windows.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill @@ -479,42 +477,30 @@ def ffill(self, *, limit: typing.Optional[int] = None) -> Series: @validations.requires_ordering() def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.rows(preceding=0, following=limit) + window = windows.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) @validations.requires_ordering() def cummax(self) -> Series: - return self._apply_window_op( - agg_ops.max_op, bigframes.core.window_spec.cumulative_rows() - 
) + return self._apply_window_op(agg_ops.max_op, windows.cumulative_rows()) @validations.requires_ordering() def cummin(self) -> Series: - return self._apply_window_op( - agg_ops.min_op, bigframes.core.window_spec.cumulative_rows() - ) + return self._apply_window_op(agg_ops.min_op, windows.cumulative_rows()) @validations.requires_ordering() def cumprod(self) -> Series: - return self._apply_window_op( - agg_ops.product_op, bigframes.core.window_spec.cumulative_rows() - ) + return self._apply_window_op(agg_ops.product_op, windows.cumulative_rows()) @validations.requires_ordering() def shift(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.rows( - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, - ) - return self._apply_window_op(agg_ops.ShiftOp(periods), window) + window_spec = windows.rows() + return self._apply_window_op(agg_ops.ShiftOp(periods), window_spec) @validations.requires_ordering() def diff(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.rows( - preceding=periods if periods > 0 else None, - following=-periods if periods < 0 else None, - ) - return self._apply_window_op(agg_ops.DiffOp(periods), window) + window_spec = windows.rows() + return self._apply_window_op(agg_ops.DiffOp(periods), window_spec) @validations.requires_ordering() def pct_change(self, periods: int = 1) -> Series: @@ -1053,7 +1039,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=bigframes.core.window_spec.unbound(), + window_spec=windows.unbound(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -1286,9 +1272,7 @@ def _apply_aggregation( ) -> Any: return self._block.get_stat(self._value_column, op) - def _apply_window_op( - self, op: agg_ops.WindowOp, window_spec: bigframes.core.window_spec.WindowSpec - ): + def _apply_window_op(self, op: agg_ops.WindowOp, window_spec: windows.WindowSpec): block = self._block block, result_id = block.apply_window_op( self._value_column, op, window_spec=window_spec, result_label=self.name @@ -1345,7 +1329,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: @validations.requires_ordering() def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. - window_spec = bigframes.core.window_spec.rows( + window_spec = windows.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1354,9 +1338,7 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window @validations.requires_ordering() def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = bigframes.core.window_spec.cumulative_rows( - min_periods=min_periods - ) + window_spec = windows.cumulative_rows(min_periods=min_periods) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True ) diff --git a/noxfile.py b/noxfile.py index 5dbcdea583..f4142af21b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -62,8 +62,9 @@ UNIT_TEST_EXTRAS: List[str] = [] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +# There are 4 different ibis-framework 9.x versions we want to test against. # 3.10 is needed for Windows tests. 
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", diff --git a/setup.py b/setup.py index 2ae61a44f3..908a3c244f 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", + "ibis-framework[bigquery] >=9.0.0,<=9.3.0", "jellyfish >=0.8.9", "numpy >=1.24.0", "pandas >=1.5.3", @@ -59,7 +59,7 @@ # Keep sqlglot versions in sync with ibis-framework. This avoids problems # where the incorrect version of sqlglot is installed, such as # https://github.com/googleapis/python-bigquery-dataframes/issues/315 - "sqlglot >=20.8.0,<=20.11", + "sqlglot >=23.6.3,<25.2", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index e69de29bb2..60ac0af60f 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -0,0 +1,3 @@ +# Some internal modules have moved, +# so make sure we test on all ibis-framework 9.x versions. +ibis-framework==9.1.0 diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index e69de29bb2..dbbb5a2d88 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -0,0 +1,3 @@ +# Some internal modules have moved, +# so make sure we test on all ibis-framework 9.x versions. +ibis-framework==9.2.0 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index a446a4bfa7..4a9d1ae281 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -12,7 +12,7 @@ google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 -ibis-framework==8.0.0 +ibis-framework==9.0.0 jellyfish==0.8.9 numpy==1.24.0 pandas==1.5.3 @@ -21,7 +21,7 @@ pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 sqlalchemy==1.4 -sqlglot==20.8.0 +sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index b382a5593c..49cd887cfd 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -29,7 +29,7 @@ import google.cloud.functions_v2 as functions_v2 import google.cloud.resourcemanager_v3 as resourcemanager_v3 import google.cloud.storage as storage # type: ignore -import ibis.backends.base +import ibis.backends import numpy as np import pandas as pd import pytest @@ -105,7 +105,7 @@ def bigquery_client_tokyo(session_tokyo: bigframes.Session) -> bigquery.Client: @pytest.fixture(scope="session") -def ibis_client(session: bigframes.Session) -> ibis.backends.base.BaseBackend: +def ibis_client(session: bigframes.Session) -> ibis.backends.BaseBackend: return session.ibis_client diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 77ea4627ec..e224f65a01 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1726,8 +1726,10 @@ def test_df_apply_axis_1_complex(session, pd_df): def serialize_row(row): custom = { - "name": row.name, - "index": [idx for idx in row.index], + "name": row.name.item() if hasattr(row.name, "item") else row.name, + "index": [ + idx.item() if hasattr(idx, "item") else idx for idx in row.index + ], "values": [ val.item() if hasattr(val, "item") else val for val in row.values ], diff --git 
a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f51b597650..9e046dc62e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4568,6 +4568,9 @@ def test_recursion_limit(scalars_df_index): scalars_df_index.to_pandas() +@pytest.mark.skipif( + reason="b/366477265: Skip until query complexity error can be reliably triggered." +) def test_query_complexity_error(scalars_df_index): # This test requires automatic caching/query decomposition to be turned off bf_df = scalars_df_index diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py deleted file mode 100644 index e2648d1eba..0000000000 --- a/tests/system/small/test_ibis.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for monkeypatched ibis code.""" - -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops -import ibis.expr.types as ibis_types - -import bigframes - - -def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): - num_bins = 3 - ibis_client = session.ibis_client - project, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table( # type: ignore - table_id, - schema=dataset, - database=project, - ) - ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] - quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( - ibis_column, # type: ignore - num_bins=num_bins, # type: ignore - ).to_expr() - value = quantiles[1] - num_edges = quantiles.length() - - sql = ibis_client.compile(value) - num_edges_result = num_edges.to_pandas() - - assert "APPROX_QUANTILES" in sql - assert num_edges_result == num_bins + 1 diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 8f62d9628c..37a707b9d0 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -70,6 +70,13 @@ def test_df_ufuncs(scalars_dfs, opname): ).to_pandas() pd_result = getattr(np, opname)(scalars_pandas_df[["float64_col", "int64_col"]]) + # In NumPy versions 2 and later, `np.floor` and `np.ceil` now produce integer + # outputs for the "int64_col" column. + if opname in ["floor", "ceil"] and isinstance( + pd_result["int64_col"].dtypes, pd.Int64Dtype + ): + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.Float64Dtype()) + pd.testing.assert_frame_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 9a6783ee5c..b8f7926aec 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2346,8 +2346,13 @@ def test_value_counts(scalars_dfs, kwargs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].value_counts(**kwargs).to_pandas() - pd_result = scalars_pandas_df[col_name].value_counts(**kwargs) + # Pandas `value_counts` can produce non-deterministic results with tied counts. 
+ # Remove duplicates to enforce a consistent output. + s = scalars_df[col_name].drop(0) + pd_s = scalars_pandas_df[col_name].drop(0) + + bf_result = s.value_counts(**kwargs).to_pandas() + pd_result = pd_s.value_counts(**kwargs) pd.testing.assert_series_equal( bf_result, diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index 1d2d05a741..e69de29bb2 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -1,3 +0,0 @@ -# Import all sub-modules to monkeypatch everything. -import bigframes_vendored.ibis.backends.bigquery.compiler # noqa -import bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/backend.py b/third_party/bigframes_vendored/ibis/backends/bigquery/backend.py index f917ef950d..d4d5156572 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/backend.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/backend.py @@ -8,10 +8,10 @@ import contextlib import glob import os -import re from typing import Any, Optional, TYPE_CHECKING from bigframes_vendored.ibis.backends.bigquery.datatypes import BigQueryType +import bigframes_vendored.ibis.backends.sql.compilers as sc import google.api_core.exceptions import google.auth.credentials import google.cloud.bigquery as bq @@ -27,7 +27,6 @@ ) from ibis.backends.bigquery.datatypes import BigQuerySchema from ibis.backends.sql import SQLBackend -import ibis.backends.sql.compilers as sc import ibis.common.exceptions as com import ibis.expr.operations as ops import ibis.expr.schema as sch @@ -81,52 +80,6 @@ def _create_client_info_gapic(application_name): return ClientInfo(user_agent=_create_user_agent(application_name)) -_MEMTABLE_PATTERN = re.compile( - r"^_?ibis_(?:[A-Za-z_][A-Za-z_0-9]*)_memtable_[a-z0-9]{26}$" -) - - -def _qualify_memtable( - node: sge.Expression, *, dataset: str | None, project: str | None -) -> sge.Expression: - """Add a BigQuery dataset and project to memtable references.""" - if isinstance(node, sge.Table) and _MEMTABLE_PATTERN.match(node.name) is not None: - node.args["db"] = dataset - node.args["catalog"] = project - # make sure to quote table location - node = _force_quote_table(node) - return node - - -def _remove_null_ordering_from_unsupported_window( - node: sge.Expression, -) -> sge.Expression: - """Remove null ordering in window frame clauses not supported by BigQuery. - - BigQuery has only partial support for NULL FIRST/LAST in RANGE windows so - we remove it from any window frame clause that doesn't support it. - - Here's the support matrix: - - ✅ sum(x) over (order by y desc nulls last) - 🚫 sum(x) over (order by y asc nulls last) - ✅ sum(x) over (order by y asc nulls first) - 🚫 sum(x) over (order by y desc nulls first) - """ - if isinstance(node, sge.Window): - order = node.args.get("order") - if order is not None: - for key in order.args["expressions"]: - kargs = key.args - if kargs.get("desc") is True and kargs.get("nulls_first", False): - kargs["nulls_first"] = False - elif kargs.get("desc") is False and not kargs.setdefault( - "nulls_first", True - ): - kargs["nulls_first"] = True - return node - - def _force_quote_table(table: sge.Table) -> sge.Table: """Force quote all the parts of a bigquery path. 
@@ -156,6 +109,10 @@ class Backend(SQLBackend, CanCreateDatabase, CanCreateSchema): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.__session_dataset: bq.DatasetReference | None = None + self._query_cache.lookup = lambda name: self.table( + name, + database=(self._session_dataset.project, self._session_dataset.dataset_id), + ).op() @property def _session_dataset(self): @@ -163,32 +120,6 @@ def _session_dataset(self): self.__session_dataset = self._make_session() return self.__session_dataset - def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: - raw_name = op.name - - session_dataset = self._session_dataset - project = session_dataset.project - dataset = session_dataset.dataset_id - - table_ref = bq.TableReference(session_dataset, raw_name) - try: - self.client.get_table(table_ref) - except google.api_core.exceptions.NotFound: - table_id = sg.table( - raw_name, db=dataset, catalog=project, quoted=False - ).sql(dialect=self.name) - bq_schema = BigQuerySchema.from_ibis(op.schema) - load_job = self.client.load_table_from_dataframe( - op.data.to_frame(), - table_id, - job_config=bq.LoadJobConfig( - # fail if the table already exists and contains data - write_disposition=bq.WriteDisposition.WRITE_EMPTY, - schema=bq_schema, - ), - ) - load_job.result() - def _read_file( self, path: str | Path, @@ -793,7 +724,6 @@ def to_pyarrow( **kwargs: Any, ) -> pa.Table: self._import_pyarrow() - self._register_in_memory_tables(expr) sql = self.compile(expr, limit=limit, params=params, **kwargs) self._log(sql) query = self.raw_sql(sql, params=params, **kwargs) @@ -816,7 +746,6 @@ def to_pyarrow_batches( schema = expr.as_table().schema() - self._register_in_memory_tables(expr) sql = self.compile(expr, limit=limit, params=params, **kwargs) self._log(sql) query = self.raw_sql(sql, params=params, page_size=chunk_size, **kwargs) @@ -1009,9 +938,6 @@ def create_table( if obj is not None and not isinstance(obj, ir.Table): obj = ibis.memtable(obj, schema=schema) - if obj is not None: - self._register_in_memory_tables(obj) - if temp: dataset = self._session_dataset.dataset_id if database is not None: @@ -1107,7 +1033,6 @@ def create_view( expression=self.compile(obj), replace=overwrite, ) - self._register_in_memory_tables(obj) self.raw_sql(stmt.sql(self.name)) return self.table(name, database=(catalog, database)) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py deleted file mode 100644 index 414f0a7c81..0000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py +++ /dev/null @@ -1,59 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/compiler.py -"""Module to convert from Ibis expression to SQL string.""" - -from __future__ import annotations - -import re - -from ibis.backends.base.sql import compiler as sql_compiler -import ibis.backends.bigquery.compiler -from ibis.backends.bigquery.datatypes import BigQueryType -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops - -_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') -_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$") - - -class BigQueryTableSetFormatter(sql_compiler.TableSetFormatter): - def _quote_identifier(self, name): - """Restore 6.x version of identifier quoting. - - 7.x uses sqlglot which as of December 2023 doesn't know about the - extended unicode names for BigQuery yet. 
- """ - if _EXACT_NAME_REGEX.match(name) is not None: - return name - return f"`{name}`" - - def _format_in_memory_table(self, op): - """Restore 6.x version of InMemoryTable. - - BigQuery DataFrames explicitly uses InMemoryTable only when we know - the data is small enough to embed in SQL. - """ - schema = op.schema - names = schema.names - types = schema.types - - raw_rows = [] - for row in op.data.to_frame().itertuples(index=False): - raw_row = ", ".join( - f"{self._translate(lit)} AS {name}" - for lit, name in zip( - map(ops.Literal, row, types), map(self._quote_identifier, names) - ) - ) - raw_rows.append(f"STRUCT({raw_row})") - array_type = BigQueryType.from_ibis(dt.Array(op.schema.as_struct())) - - return f"UNNEST({array_type}[{', '.join(raw_rows)}])" - - -# Override implementation. -ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._quote_identifier = ( - BigQueryTableSetFormatter._quote_identifier -) -ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._format_in_memory_table = ( - BigQueryTableSetFormatter._format_in_memory_table -) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py deleted file mode 100644 index ecef2115e5..0000000000 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ /dev/null @@ -1,72 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/registry.py -"""Module to convert from Ibis expression to SQL string.""" - -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops -from ibis.backends.bigquery.registry import OPERATION_REGISTRY -import ibis.expr.operations.reductions as ibis_reductions - - -def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): - arg = translator.translate(op.arg) - num_bins = translator.translate(op.num_bins) - return f"APPROX_QUANTILES({arg}, {num_bins})" - - -def _first_non_null_value(translator, op: vendored_ibis_ops.FirstNonNullValue): - arg = translator.translate(op.arg) - return f"FIRST_VALUE({arg} IGNORE NULLS)" - - -def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue): - arg = translator.translate(op.arg) - return f"LAST_VALUE({arg} IGNORE NULLS)" - - -def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString): - arg = translator.translate(op.arg) - return f"TO_JSON_STRING({arg})" - - -def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): - arg = translator.translate(op.arg) - return f"GENERATE_ARRAY(0, {arg})" - - -def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime): - arg = translator.translate(op.arg) - return f"SAFE_CAST({arg} AS DATETIME)" - - -def _quantile(translator, op: ibis_reductions.Quantile): - arg = translator.translate(op.arg) - quantile = translator.translate(op.quantile) - return f"PERCENTILE_CONT({arg}, {quantile})" - - -def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): - """This method provides the same functionality as the collect() method in Ibis, with - the added capability of ordering the results using order_by. 
- https://github.com/ibis-project/ibis/issues/9170 - """ - arg = translator.translate(op.arg) - - order_by_sql = "" - if len(op.order_by) > 0: - order_by = ", ".join([translator.translate(column) for column in op.order_by]) - order_by_sql = f"ORDER BY {order_by}" - - return f"ARRAY_AGG({arg} IGNORE NULLS {order_by_sql})" - - -patched_ops = { - vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore - vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore - vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore - vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore - vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore - vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore - ibis_reductions.Quantile: _quantile, # type:ignore - vendored_ibis_ops.ArrayAggregate: _array_aggregate, # type:ignore -} - -OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index c74de82099..cbd8e4e2d9 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -610,7 +610,6 @@ def translate(self, op, *, params: Mapping[ir.Value, Any]) -> sge.Expression: op, params=params, rewrites=self.rewrites, - post_rewrites=self.post_rewrites, fuse_selects=options.sql.fuse_selects, ) @@ -1125,7 +1124,7 @@ def visit_Coalesce(self, op, *, arg): ### Ordering and window functions - def visit_SortKey(self, op, *, expr, ascending: bool, nulls_first: bool): + def visit_SortKey(self, op, *, expr, ascending: bool, nulls_first: bool = False): return sge.Ordered(this=expr, desc=not ascending, nulls_first=nulls_first) def visit_ApproxMedian(self, op, *, arg, where): @@ -1262,11 +1261,9 @@ def _cleanup_names(self, exprs: Mapping[str, sge.Expression]): else: yield value.as_(name, quoted=self.quoted, copy=False) - def visit_Select( - self, op, *, parent, selections, predicates, qualified, sort_keys, distinct - ): + def visit_Select(self, op, *, parent, selections, predicates, qualified, sort_keys): # if we've constructed a useless projection return the parent relation - if not (selections or predicates or qualified or sort_keys or distinct): + if not (selections or predicates or qualified or sort_keys): return parent result = parent @@ -1293,9 +1290,6 @@ def visit_Select( if sort_keys: result = result.order_by(*sort_keys, copy=False) - if distinct: - result = result.distinct() - return result def visit_DummyTable(self, op, *, values): @@ -1392,7 +1386,10 @@ def visit_Aggregate(self, op, *, parent, groups, metrics): @classmethod def _add_parens(cls, op, sg_expr): - if isinstance(op, cls.NEEDS_PARENS): + # Patch for https://github.com/ibis-project/ibis/issues/9975 + if isinstance(op, cls.NEEDS_PARENS) or ( + isinstance(op, ops.Alias) and isinstance(op.arg, cls.NEEDS_PARENS) + ): return sge.paren(sg_expr, copy=False) return sg_expr @@ -1480,6 +1477,11 @@ def visit_Limit(self, op, *, parent, n, offset): return result.subquery(alias, copy=False) return result + def visit_Distinct(self, op, *, parent): + return ( + sg.select(STAR, copy=False).distinct(copy=False).from_(parent, copy=False) + ) + def visit_CTE(self, op, *, parent): return sg.table(parent.alias_or_name, quoted=self.quoted) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py 
b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index fc8d93a433..3015991a26 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1,5 +1,4 @@ # Contains code from https://github.com/ibis-project/ibis/blob/main/ibis/backends/sql/compilers/bigquery/__init__.py - """Module to convert from Ibis expression to SQL string.""" from __future__ import annotations @@ -9,28 +8,25 @@ import re from typing import Any, TYPE_CHECKING -from bigframes_vendored.ibis.backends.bigquery.datatypes import ( - BigQueryType, - BigQueryUDFType, -) +import bigframes_vendored.ibis.backends.bigquery.datatypes as bq_datatypes from bigframes_vendored.ibis.backends.sql.compilers.base import ( AggGen, NULL, SQLGlotCompiler, STAR, ) -from bigframes_vendored.ibis.backends.sql.rewrites import ( +from ibis import util +from ibis.backends.sql.datatypes import BigQueryType, BigQueryUDFType +from ibis.backends.sql.rewrites import ( exclude_unsupported_window_frame_from_ops, exclude_unsupported_window_frame_from_rank, exclude_unsupported_window_frame_from_row_number, - split_select_distinct_with_order_by, ) -from ibis import util -from ibis.backends.sql.compilers.bigquery.udf.core import PythonToJavaScriptTranslator import ibis.common.exceptions as com from ibis.common.temporal import DateUnit, IntervalUnit, TimestampUnit, TimeUnit import ibis.expr.datatypes as dt import ibis.expr.operations as ops +import numpy as np import sqlglot as sg from sqlglot.dialects import BigQuery import sqlglot.expressions as sge @@ -40,6 +36,7 @@ import ibis.expr.types as ir + _NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') @@ -55,8 +52,6 @@ def _qualify_memtable( if isinstance(node, sge.Table) and _MEMTABLE_PATTERN.match(node.name) is not None: node.args["db"] = dataset node.args["catalog"] = project - # make sure to quote table location - node = _force_quote_table(node) return node @@ -64,12 +59,9 @@ def _remove_null_ordering_from_unsupported_window( node: sge.Expression, ) -> sge.Expression: """Remove null ordering in window frame clauses not supported by BigQuery. - BigQuery has only partial support for NULL FIRST/LAST in RANGE windows so we remove it from any window frame clause that doesn't support it. - Here's the support matrix: - ✅ sum(x) over (order by y desc nulls last) 🚫 sum(x) over (order by y asc nulls last) ✅ sum(x) over (order by y asc nulls first) @@ -89,27 +81,6 @@ def _remove_null_ordering_from_unsupported_window( return node -def _force_quote_table(table: sge.Table) -> sge.Table: - """Force quote all the parts of a bigquery path. - - The BigQuery identifier quoting semantics are bonkers - https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers - - my-table is OK, but not mydataset.my-table - - mytable-287 is OK, but not mytable-287a - - Just quote everything. 
- """ - for key in ("this", "db", "catalog"): - if (val := table.args[key]) is not None: - if isinstance(val, sg.exp.Identifier) and not val.quoted: - val.args["quoted"] = True - else: - table.args[key] = sg.to_identifier(val, quoted=True) - return table - - class BigQueryCompiler(SQLGlotCompiler): dialect = BigQuery type_mapper = BigQueryType @@ -123,7 +94,6 @@ class BigQueryCompiler(SQLGlotCompiler): exclude_unsupported_window_frame_from_rank, *SQLGlotCompiler.rewrites, ) - post_rewrites = (split_select_distinct_with_order_by,) supports_qualify = True @@ -214,7 +184,6 @@ def to_sqlglot( session_project: str | None = None, ) -> Any: """Compile an Ibis expression. - Parameters ---------- expr @@ -228,18 +197,16 @@ def to_sqlglot( Optional dataset ID to qualify memtable references. session_project Optional project ID to qualify memtable references. - Returns ------- Any The output of compilation. The type of this value depends on the backend. - """ sql = super().to_sqlglot(expr, limit=limit, params=params) table_expr = expr.as_table() - geocols = table_expr.schema().geospatial + geocols = getattr(table_expr.schema(), "geospatial", None) result = sql.transform( _qualify_memtable, @@ -280,64 +247,6 @@ def to_sqlglot( sources.append(result) return sources - def _compile_python_udf(self, udf_node: ops.ScalarUDF) -> sge.Create: - name = type(udf_node).__name__ - type_mapper = self.udf_type_mapper - - body = PythonToJavaScriptTranslator(udf_node.__func__).compile() - config = udf_node.__config__ - libraries = config.get("libraries", []) - - signature = [ - sge.ColumnDef( - this=sg.to_identifier(name, quoted=self.quoted), - kind=type_mapper.from_ibis(param.annotation.pattern.dtype), - ) - for name, param in udf_node.__signature__.parameters.items() - ] - - lines = ['"""'] - - if config.get("strict", True): - lines.append('"use strict";') - - lines += [ - body, - "", - f"return {udf_node.__func_name__}({', '.join(udf_node.argnames)});", - '"""', - ] - - func = sge.Create( - kind="FUNCTION", - this=sge.UserDefinedFunction( - this=sg.to_identifier(name), expressions=signature, wrapped=True - ), - # not exactly what I had in mind, but it works - # - # quoting is too simplistic to handle multiline strings - expression=sge.Var(this="\n".join(lines)), - exists=False, - properties=sge.Properties( - expressions=[ - sge.TemporaryProperty(), - sge.ReturnsProperty(this=type_mapper.from_ibis(udf_node.dtype)), - sge.StabilityProperty( - this="IMMUTABLE" if config.get("determinism") else "VOLATILE" - ), - sge.LanguageProperty(this=sg.to_identifier("js")), - ] - + [ - sge.Property( - this=sg.to_identifier("library"), value=self.f.array(*libraries) - ) - ] - * bool(libraries) - ), - ) - - return func - @staticmethod def _minimize_spec(start, end, spec): if ( @@ -528,7 +437,7 @@ def visit_ArrayContains(self, op, *, arg, other): def visit_StringContains(self, op, *, haystack, needle): return self.f.strpos(haystack, needle) > 0 - def visti_StringFind(self, op, *, arg, substr, start, end): + def visit_StringFind(self, op, *, arg, substr, start, end): if start is not None: raise NotImplementedError( "`start` not implemented for BigQuery string find" @@ -573,6 +482,9 @@ def visit_NonNullLiteral(self, op, *, value, dtype): ) elif dtype.is_uuid(): return sge.convert(str(value)) + + elif dtype.is_int64(): + return sge.convert(np.int64(value)) return None def visit_IntervalFromInteger(self, op, *, arg, unit): @@ -1110,5 +1022,116 @@ def visit_ArrayAny(self, op, *, arg): def visit_ArrayAll(self, op, *, arg): return 
self._array_reduction(arg=arg, reduction="logical_and") + # Customized ops for bigframes + + def visit_InMemoryTable(self, op, *, name, schema, data): + # Avoid creating temp tables for small data, which is how memtable is + # used in BigQuery DataFrames. Inspired by: + # https://github.com/ibis-project/ibis/blob/efa6fb72bf4c790450d00a926d7bd809dade5902/ibis/backends/druid/compiler.py#L95 + tuples = data.to_frame().itertuples(index=False) + quoted = self.quoted + columns = [sg.column(col, quoted=quoted) for col in schema.names] + array_expr = sge.DataType( + this=sge.DataType.Type.STRUCT, + expressions=[ + sge.ColumnDef( + this=sge.to_identifier(field, quoted=self.quoted), + kind=bq_datatypes.BigQueryType.from_ibis(type_), + ) + for field, type_ in zip(schema.names, schema.types) + ], + nested=True, + ) + array_values = [ + sge.Tuple( + expressions=tuple( + self.visit_Literal(None, value=value, dtype=type_) + for value, type_ in zip(row, schema.types) + ) + ) + for row in tuples + ] + expr = sge.Unnest( + expressions=[ + sge.DataType( + this=sge.DataType.Type.ARRAY, + expressions=[array_expr], + nested=True, + values=array_values, + ), + ], + alias=sge.TableAlias( + this=sg.to_identifier(name, quoted=quoted), + columns=columns, + ), + ) + # return expr + return sg.select(sge.Star()).from_(expr) + + def visit_ArrayAggregate(self, op, *, arg, order_by, where): + if len(order_by) > 0: + expr = sge.Order( + this=arg, + expressions=[ + # Avoid adding NULLS FIRST / NULLS LAST in SQL, which is + # unsupported in ARRAY_AGG by reconstructing the node as + # plain SQL text. + f"({order_column.args['this'].sql(dialect='bigquery')}) {'DESC' if order_column.args.get('desc') else 'ASC'}" + for order_column in order_by + ], + ) + else: + expr = arg + return sge.IgnoreNulls(this=self.agg.array_agg(expr, where=where)) + + def visit_FirstNonNullValue(self, op, *, arg): + return sge.IgnoreNulls(this=sge.FirstValue(this=arg)) + + def visit_LastNonNullValue(self, op, *, arg): + return sge.IgnoreNulls(this=sge.LastValue(this=arg)) + + def visit_ToJsonString(self, op, *, arg): + return self.f.to_json_string(arg) + + def visit_Quantile(self, op, *, arg, quantile, where): + return sge.PercentileCont(this=arg, expression=quantile) + + def visit_WindowFunction(self, op, *, how, func, start, end, group_by, order_by): + # Patch for https://github.com/ibis-project/ibis/issues/9872 + if start is None and end is None: + spec = None + else: + if start is None: + start = {} + if end is None: + end = {} + + start_value = start.get("value", "UNBOUNDED") + start_side = start.get("side", "PRECEDING") + end_value = end.get("value", "UNBOUNDED") + end_side = end.get("side", "FOLLOWING") + + if getattr(start_value, "this", None) == "0": + start_value = "CURRENT ROW" + start_side = None + + if getattr(end_value, "this", None) == "0": + end_value = "CURRENT ROW" + end_side = None + + spec = sge.WindowSpec( + kind=how.upper(), + start=start_value, + start_side=start_side, + end=end_value, + end_side=end_side, + over="OVER", + ) + spec = self._minimize_spec(op.start, op.end, spec) + + order = sge.Order(expressions=order_by) if order_by else None + + return sge.Window(this=func, partition_by=group_by, order=order, spec=spec) + compiler = BigQueryCompiler() diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index 1f67902395..28954003f3 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ 
b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -1,367 +1,516 @@ # Contains code from https://github.com/ibis-project/ibis/blob/main/ibis/backends/sql/rewrites.py -"""Some common rewrite functions to be shared between backends.""" +"""Lower the ibis expression graph to a SQL-like relational algebra.""" from __future__ import annotations -from collections import defaultdict +from collections.abc import Mapping +from functools import reduce +import operator +from typing import Any, TYPE_CHECKING +from ibis.common.annotations import attribute from ibis.common.collections import FrozenDict # noqa: TCH001 -from ibis.common.deferred import _, deferred, Item, var -from ibis.common.exceptions import ExpressionError, IbisInputError -from ibis.common.graph import Node as Traversable -from ibis.common.graph import traverse -from ibis.common.grounds import Concrete -from ibis.common.patterns import Check, pattern, replace +from ibis.common.deferred import var +import ibis.common.exceptions as com +from ibis.common.graph import Graph +from ibis.common.patterns import InstanceOf, Object, Pattern, replace from ibis.common.typing import VarTuple # noqa: TCH001 +import ibis.expr.datatypes as dt import ibis.expr.operations as ops -from ibis.util import Namespace, promote_list +from ibis.expr.rewrites import d, p, replace_parameter +from ibis.expr.schema import Schema +from public import public import toolz -p = Namespace(pattern, module=ops) -d = Namespace(deferred, module=ops) - +if TYPE_CHECKING: + from collections.abc import Sequence x = var("x") y = var("y") -name = var("name") - - -class DerefMap(Concrete, Traversable): - """Trace and replace fields from earlier relations in the hierarchy. - In order to provide a nice user experience, we need to allow expressions - from earlier relations in the hierarchy. Consider the following example: - t = ibis.table([('a', 'int64'), ('b', 'string')], name='t') - t1 = t.select([t.a, t.b]) - t2 = t1.filter(t.a > 0) # note that not t1.a is referenced here - t3 = t2.select(t.a) # note that not t2.a is referenced here - However the relational operations in the IR are strictly enforcing that - the expressions are referencing the immediate parent only. So we need to - track fields upwards the hierarchy to replace `t.a` with `t1.a` and `t2.a` - in the example above. This is called dereferencing. - Whether we can treat or not a field of a relation semantically equivalent - with a field of an earlier relation in the hierarchy depends on the - `.values` mapping of the relation. Leaf relations, like `t` in the example - above, have an empty `.values` mapping, so we cannot dereference fields - from them. On the other hand a projection, like `t1` in the example above, - has a `.values` mapping like `{'a': t.a, 'b': t.b}`, so we can deduce that - `t1.a` is semantically equivalent with `t.a` and so on. - """ - """The relations we want the values to point to.""" - rels: VarTuple[ops.Relation] - - """Substitution mapping from values of earlier relations to the fields of `rels`.""" - subs: FrozenDict[ops.Value, ops.Field] - - """Ambiguous field references.""" - ambigs: FrozenDict[ops.Value, VarTuple[ops.Value]] - - @classmethod - def from_targets(cls, rels, extra=None): - """Create a dereference map from a list of target relations. - Usually a single relation is passed except for joins where multiple - relations are involved. - Parameters - ---------- - rels : list of ops.Relation - The target relations to dereference to. 
- extra : dict, optional - Extra substitutions to be added to the dereference map. - Returns - ------- - DerefMap - """ - rels = promote_list(rels) - mapping = defaultdict(dict) - for rel in rels: - for field in rel.fields.values(): - for value, distance in cls.backtrack(field): - mapping[value][field] = distance - - subs, ambigs = {}, {} - for from_, to in mapping.items(): - mindist = min(to.values()) - minkeys = [k for k, v in to.items() if v == mindist] - # if all the closest fields are from the same relation, then we - # can safely substitute them and we pick the first one arbitrarily - if all(minkeys[0].relations == k.relations for k in minkeys): - subs[from_] = minkeys[0] - else: - ambigs[from_] = minkeys - - if extra is not None: - subs.update(extra) - - return cls(rels, subs, ambigs) - - @classmethod - def backtrack(cls, value): - """Backtrack the field in the relation hierarchy. - The field is traced back until no modification is made, so only follow - ops.Field nodes not arbitrary values. - Parameters - ---------- - value : ops.Value - The value to backtrack. - Yields - ------ - tuple[ops.Field, int] - The value node and the distance from the original value. - """ - distance = 0 - # track down the field in the hierarchy until no modification - # is made so only follow ops.Field nodes not arbitrary values; - while isinstance(value, ops.Field): - yield value, distance - value = value.rel.values.get(value.name) - distance += 1 - if ( - value is not None - and value.relations - and not value.find(ops.Impure, filter=ops.Value) - ): - yield value, distance - - def dereference(self, value): - """Dereference a value to the target relations. - Also check for ambiguous field references. If a field reference is found - which is marked as ambiguous, then raise an error. - Parameters - ---------- - value : ops.Value - The value to dereference. - Returns - ------- - ops.Value - The dereferenced value. - """ - ambigs = value.find(lambda x: x in self.ambigs, filter=ops.Value) - if ambigs: - raise IbisInputError( - f"Ambiguous field reference {ambigs!r} in expression {value!r}" - ) - return value.replace(self.subs, filter=ops.Value) - - -def flatten_predicates(node): - """Yield the expressions corresponding to the `And` nodes of a predicate. 
- Examples - -------- - >>> import ibis - >>> t = ibis.table([("a", "int64"), ("b", "string")], name="t") - >>> filt = (t.a == 1) & (t.b == "foo") - >>> predicates = flatten_predicates(filt.op()) - >>> len(predicates) - 2 - >>> predicates[0].to_expr().name("left") - r0 := UnboundTable: t - a int64 - b string - left: r0.a == 1 - >>> predicates[1].to_expr().name("right") - r0 := UnboundTable: t - a int64 - b string - right: r0.b == 'foo' + +@public +class CTE(ops.Relation): + """Common table expression.""" + + parent: ops.Relation + + @attribute + def schema(self): + return self.parent.schema + + @attribute + def values(self): + return self.parent.values + + +@public +class Select(ops.Relation): + """Relation modelled after SQL's SELECT statement.""" + + parent: ops.Relation + selections: FrozenDict[str, ops.Value] = {} + predicates: VarTuple[ops.Value[dt.Boolean]] = () + qualified: VarTuple[ops.Value[dt.Boolean]] = () + sort_keys: VarTuple[ops.SortKey] = () + + def is_star_selection(self): + return tuple(self.values.items()) == tuple(self.parent.fields.items()) + + @attribute + def values(self): + return self.selections + + @attribute + def schema(self): + return Schema({k: v.dtype for k, v in self.selections.items()}) + + +@public +class FirstValue(ops.Analytic): + """Retrieve the first element.""" + + arg: ops.Column[dt.Any] + + @attribute + def dtype(self): + return self.arg.dtype + + +@public +class LastValue(ops.Analytic): + """Retrieve the last element.""" + + arg: ops.Column[dt.Any] + + @attribute + def dtype(self): + return self.arg.dtype + + +# TODO(kszucs): there is a better strategy to rewrite the relational operations +# to Select nodes by wrapping the leaf nodes in a Select node and then merging +# Project, Filter, Sort, etc. incrementally into the Select node. This way we +# can have tighter control over simplification logic. 
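# Illustrative sketch (not part of the vendored module; assumes only that
# ibis is importable): the rewrites below turn each Project/Filter/Sort node
# into a Select, and merge_select_select further down fuses adjacent Selects
# into a single SELECT ... WHERE ... ORDER BY when it is safe to do so.
import ibis

t = ibis.table({"a": "int64", "b": "string"}, name="t")
expr = t.filter(t.a > 0).select("a", "b").order_by("a")
# expr.op() is a Sort over a Project over a Filter; after lowering, each of
# those becomes a Select node, and the three Selects fuse into one because
# none of them contains window functions, subqueries, unnests, or impure
# expressions (the `blocking` set checked by merge_select_select).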
+ + +@replace(p.Project) +def project_to_select(_, **kwargs): + """Convert a Project node to a Select node.""" + return Select(_.parent, selections=_.values) + + +def partition_predicates(predicates): + qualified = [] + unqualified = [] + + for predicate in predicates: + if predicate.find(ops.WindowFunction, filter=ops.Value): + qualified.append(predicate) + else: + unqualified.append(predicate) + + return unqualified, qualified + + +@replace(p.Filter) +def filter_to_select(_, **kwargs): + """Convert a Filter node to a Select node.""" + predicates, qualified = partition_predicates(_.predicates) + return Select( + _.parent, selections=_.values, predicates=predicates, qualified=qualified + ) + + +@replace(p.Sort) +def sort_to_select(_, **kwargs): + """Convert a Sort node to a Select node.""" + return Select(_.parent, selections=_.values, sort_keys=_.keys) + + +if hasattr(p, "DropColumns"): + + @replace(p.DropColumns) + def drop_columns_to_select(_, **kwargs): + """Convert a DropColumns node to a Select node.""" + # if we're dropping fewer than 50% of the parent table's columns then the + # compiled query will likely be smaller than if we list everything *NOT* + # being dropped + if len(_.columns_to_drop) < len(_.schema) // 2: + return _ + return Select(_.parent, selections=_.values) + + +if hasattr(p, "FillNull"): + + @replace(p.FillNull) + def fill_null_to_select(_, **kwargs): + """Rewrite FillNull to a Select node.""" + if isinstance(_.replacements, Mapping): + mapping = _.replacements + else: + mapping = { + name: _.replacements + for name, type in _.parent.schema.items() + if type.nullable + } + + if not mapping: + return _.parent + + selections = {} + for name in _.parent.schema.names: + col = ops.Field(_.parent, name) + if (value := mapping.get(name)) is not None: + col = ops.Alias(ops.Coalesce((col, value)), name) + selections[name] = col + + return Select(_.parent, selections=selections) + + +if hasattr(p, "DropNull"): + + @replace(p.DropNull) + def drop_null_to_select(_, **kwargs): + """Rewrite DropNull to a Select node.""" + if _.subset is None: + columns = [ops.Field(_.parent, name) for name in _.parent.schema.names] + else: + columns = _.subset + + if columns: + preds = [ + reduce( + ops.And if _.how == "any" else ops.Or, + [ops.NotNull(c) for c in columns], + ) + ] + elif _.how == "all": + preds = [ops.Literal(False, dtype=dt.bool)] + else: + return _.parent + + return Select(_.parent, selections=_.values, predicates=tuple(preds)) + + +@replace(p.WindowFunction(p.First | p.Last)) +def first_to_firstvalue(_, **kwargs): + """Convert a First or Last node to a FirstValue or LastValue node.""" + if _.func.where is not None: + raise com.UnsupportedOperationError( + f"`{type(_.func).__name__.lower()}` with `where` is unsupported " + "in a window function" + ) + klass = FirstValue if isinstance(_.func, ops.First) else LastValue + return _.copy(func=klass(_.func.arg)) + + +def complexity(node): + """Assign a complexity score to a node. + + Subsequent projections can be merged into a single projection by replacing + the fields referenced in the outer projection with the computed expressions + from the inner projection. This inlining can result in very complex value + expressions depending on the projections. In order to prevent excessive + inlining, we assign a complexity score to each node. + + The complexity score assigns 1 to each value expression and adds up in the + tree hierarchy unless there is a Field node where we don't add up the + complexity of the referenced relation. 
This way we treat fields kind of like + reusable variables considering them less complex than they were inlined. """ - def predicate(node): - if isinstance(node, ops.And): - # proceed and don't yield the node - return True, None + def accum(node, *args): + if isinstance(node, ops.Field): + return 1 else: - # halt and yield the node - return False, node - - return list(traverse(predicate, node)) - - -@replace(p.Field(p.JoinChain)) -def peel_join_field(_): - return _.rel.values[_.name] - - -@replace(p.ScalarParameter) -def replace_parameter(_, params, **kwargs): - """Replace scalar parameters with their values.""" - return ops.Literal(value=params[_], dtype=_.dtype) - - -@replace(p.StringSlice) -def lower_stringslice(_, **kwargs): - """Rewrite StringSlice in terms of Substring.""" - if _.end is None: - return ops.Substring(_.arg, start=_.start) - if _.start is None: - return ops.Substring(_.arg, start=0, length=_.end) - if ( - isinstance(_.start, ops.Literal) - and isinstance(_.start.value, int) - and isinstance(_.end, ops.Literal) - and isinstance(_.end.value, int) - ): - # optimization for constant values - length = _.end.value - _.start.value - else: - length = ops.Subtract(_.end, _.start) - return ops.Substring(_.arg, start=_.start, length=length) + return 1 + sum(args) + return node.map_nodes(accum)[node] -@replace(p.Analytic) -def wrap_analytic(_, **__): - # Wrap analytic functions in a window function - return ops.WindowFunction(_) +@replace(Object(Select, Object(Select))) +def merge_select_select(_, **kwargs): + """Merge subsequent Select relations into one. -@replace(p.Reduction) -def project_wrap_reduction(_, rel): - # Query all the tables that the reduction depends on - if _.relations == {rel}: - # The reduction is fully originating from the `rel`, so turn - # it into a window function of `rel` - return ops.WindowFunction(_) - else: - # 1. The reduction doesn't depend on any table, constructed from - # scalar values, so turn it into a scalar subquery. - # 2. The reduction is originating from `rel` and other tables, - # so this is a correlated scalar subquery. - # 3. The reduction is originating entirely from other tables, - # so this is an uncorrelated scalar subquery. - return ops.ScalarSubquery(_.to_expr().as_table()) - - -def rewrite_project_input(value, relation): - # we need to detect reductions which are either turned into window functions - # or scalar subqueries depending on whether they are originating from the - # relation - return value.replace( - wrap_analytic | project_wrap_reduction, - filter=p.Value & ~p.WindowFunction, - context={"rel": relation}, + This rewrites eliminates `_.parent` by merging the outer and the inner + `predicates`, `sort_keys` and keeping the outer `selections`. All selections + from the inner Select are inlined into the outer Select. 
+ """ + # don't merge if either the outer or the inner select has window functions + blocking = ( + ops.WindowFunction, + ops.ExistsSubquery, + ops.InSubquery, + ops.Unnest, + ops.Impure, ) + if _.find_below(blocking, filter=ops.Value): + return _ + if _.parent.find_below(blocking, filter=ops.Value): + return _ + subs = {ops.Field(_.parent, k): v for k, v in _.parent.values.items()} + selections = {k: v.replace(subs, filter=ops.Value) for k, v in _.selections.items()} -ReductionLike = p.Reduction | p.Field(p.Aggregate(groups={})) + predicates = tuple(p.replace(subs, filter=ops.Value) for p in _.predicates) + unique_predicates = toolz.unique(_.parent.predicates + predicates) + qualified = tuple(p.replace(subs, filter=ops.Value) for p in _.qualified) + unique_qualified = toolz.unique(_.parent.qualified + qualified) -@replace(ReductionLike) -def filter_wrap_reduction(_): - # Wrap reductions or fields referencing an aggregation without a group by - - # which are scalar fields - in a scalar subquery. In the latter case we - # use the reduction value from the aggregation. - if isinstance(_, ops.Field): - value = _.rel.values[_.name] - else: - value = _ - return ops.ScalarSubquery(value.to_expr().as_table()) + sort_keys = tuple(s.replace(subs, filter=ops.Value) for s in _.sort_keys) + sort_key_exprs = {s.expr for s in sort_keys} + parent_sort_keys = tuple( + k for k in _.parent.sort_keys if k.expr not in sort_key_exprs + ) + unique_sort_keys = sort_keys + parent_sort_keys + + result = Select( + _.parent.parent, + selections=selections, + predicates=unique_predicates, + qualified=unique_qualified, + sort_keys=unique_sort_keys, + ) + return result if complexity(result) <= complexity(_) else _ -def rewrite_filter_input(value): - return value.replace( - wrap_analytic | filter_wrap_reduction, filter=p.Value & ~p.WindowFunction +def extract_ctes(node: ops.Relation) -> set[ops.Relation]: + cte_types = (Select, ops.Aggregate, ops.JoinChain, ops.Set, ops.Limit, ops.Sample) + dont_count = (ops.Field, ops.CountStar, ops.CountDistinctStar) + + g = Graph.from_bfs(node, filter=~InstanceOf(dont_count)) + result = set() + for op, dependents in g.invert().items(): + if isinstance(op, ops.View) or ( + len(dependents) > 1 and isinstance(op, cte_types) + ): + result.add(op) + + return result + + +def sqlize( + node: ops.Node, + params: Mapping[ops.ScalarParameter, Any], + rewrites: Sequence[Pattern] = (), + fuse_selects: bool = True, +) -> tuple[ops.Node, list[ops.Node]]: + """Lower the ibis expression graph to a SQL-like relational algebra. + + Parameters + ---------- + node + The root node of the expression graph. + params + A mapping of scalar parameters to their values. + rewrites + Supplementary rewrites to apply to the expression graph. + fuse_selects + Whether to merge subsequent Select nodes into one where possible. + + Returns + ------- + Tuple of the rewritten expression graph and a list of CTEs. 
+ + """ + assert isinstance(node, ops.Relation) + + # apply the backend specific rewrites + if rewrites: + node = node.replace(reduce(operator.or_, rewrites)) + + # lower the expression graph to a SQL-like relational algebra + context = {"params": params} + replacements = ( + replace_parameter | project_to_select | filter_to_select | sort_to_select ) + if hasattr(p, "FillNull"): + replacements = replacements | fill_null_to_select + + if hasattr(p, "DropNull"): + replacements = replacements | drop_null_to_select -@replace(p.Analytic | p.Reduction) -def window_wrap_reduction(_, window): - # Wrap analytic and reduction functions in a window function. Used in the - # value.over() API. - return ops.WindowFunction( - _, - how=window.how, - start=window.start, - end=window.end, - group_by=window.groupings, - order_by=window.orderings, + if hasattr(p, "DropColumns"): + replacements = replacements | drop_columns_to_select + + replacements = replacements | first_to_firstvalue + sqlized = node.replace( + replacements, + context=context, ) + # squash subsequent Select nodes into one + if fuse_selects: + simplified = sqlized.replace(merge_select_select) + else: + simplified = sqlized -@replace(p.WindowFunction) -def window_merge_frames(_, window): - # Merge window frames, used in the value.over() and groupby.select() APIs. - if _.how != window.how: - raise ExpressionError( - f"Unable to merge {_.how} window with {window.how} window" - ) - elif _.start and window.start and _.start != window.start: - raise ExpressionError( - "Unable to merge windows with conflicting `start` boundary" - ) - elif _.end and window.end and _.end != window.end: - raise ExpressionError("Unable to merge windows with conflicting `end` boundary") + # extract common table expressions while wrapping them in a CTE node + ctes = extract_ctes(simplified) + + def wrap(node, _, **kwargs): + new = node.__recreate__(kwargs) + return CTE(new) if node in ctes else new + + result = simplified.replace(wrap) + ctes = reversed([cte.parent for cte in result.find(CTE)]) + + return result, ctes + + +# supplemental rewrites selectively used on a per-backend basis + + +@replace(p.WindowFunction(func=p.NTile(y), order_by=())) +def add_order_by_to_empty_ranking_window_functions(_, **kwargs): + """Add an ORDER BY clause to rank window functions that don't have one.""" + return _.copy(order_by=(y,)) + + +"""Replace checks against an empty right side with `False`.""" +empty_in_values_right_side = p.InValues(options=()) >> d.Literal(False, dtype=dt.bool) - start = _.start or window.start - end = _.end or window.end - group_by = tuple(toolz.unique(_.group_by + window.groupings)) - order_keys = {} - for sort_key in window.orderings + _.order_by: - order_keys[sort_key.expr] = sort_key.ascending, sort_key.nulls_first +@replace( + p.WindowFunction(p.RankBase | p.NTile) + | p.StringFind + | p.FindInSet + | p.ArrayPosition +) +def one_to_zero_index(_, **kwargs): + """Subtract one from one-index functions.""" + return ops.Subtract(_, 1) - order_by = ( - ops.SortKey(expr, ascending=ascending, nulls_first=nulls_first) - for expr, (ascending, nulls_first) in order_keys.items() + +@replace(ops.NthValue) +def add_one_to_nth_value_input(_, **kwargs): + if isinstance(_.nth, ops.Literal): + nth = ops.Literal(_.nth.value + 1, dtype=_.nth.dtype) + else: + nth = ops.Add(_.nth, 1) + return _.copy(nth=nth) + + +@replace(p.WindowFunction(order_by=())) +def rewrite_empty_order_by_window(_, **kwargs): + return _.copy(order_by=(ops.NULL,)) + + 
+@replace(p.WindowFunction(p.RowNumber | p.NTile)) +def exclude_unsupported_window_frame_from_row_number(_, **kwargs): + return ops.Subtract(_.copy(start=None, end=0), 1) + + +@replace(p.WindowFunction(p.MinRank | p.DenseRank, start=None)) +def exclude_unsupported_window_frame_from_rank(_, **kwargs): + return ops.Subtract( + _.copy(start=None, end=0, order_by=_.order_by or (ops.NULL,)), 1 ) - return _.copy(start=start, end=end, group_by=group_by, order_by=order_by) -def rewrite_window_input(value, window): - context = {"window": window} - # if self is a reduction or analytic function, wrap it in a window function - node = value.replace( - window_wrap_reduction, - filter=p.Value & ~p.WindowFunction, - context=context, +@replace( + p.WindowFunction( + p.Lag | p.Lead | p.PercentRank | p.CumeDist | p.Any | p.All, start=None ) - # if self is already a window function, merge the existing window frame - # with the requested window frame - return node.replace(window_merge_frames, filter=p.Value, context=context) - - -# TODO(kszucs): schema comparison should be updated to not distinguish between -# different column order -@replace(p.Project(y @ p.Relation) & Check(_.schema == y.schema)) -def complete_reprojection(_, y): - # TODO(kszucs): this could be moved to the pattern itself but not sure how - # to express it, especially in a shorter way then the following check - for name in _.schema: - if _.values[name] != ops.Field(y, name): - return _ - return y +) +def exclude_unsupported_window_frame_from_ops(_, **kwargs): + return _.copy(start=None, end=0, order_by=_.order_by or (ops.NULL,)) + + +# Rewrite rules for lowering a high-level operation into one composed of more +# primitive operations. + + +@replace(p.Log2) +def lower_log2(_, **kwargs): + """Rewrite `log2` as `log`.""" + return ops.Log(_.arg, base=2) -@replace(p.Project(y @ p.Project)) -def subsequent_projects(_, y): - rule = p.Field(y, name) >> Item(y.values, name) - values = {k: v.replace(rule, filter=ops.Value) for k, v in _.values.items()} - return ops.Project(y.parent, values) +@replace(p.Log10) +def lower_log10(_, **kwargs): + """Rewrite `log10` as `log`.""" + return ops.Log(_.arg, base=10) -@replace(p.Filter(y @ p.Filter)) -def subsequent_filters(_, y): - rule = p.Field(y, name) >> d.Field(y.parent, name) - preds = tuple(v.replace(rule, filter=ops.Value) for v in _.predicates) - return ops.Filter(y.parent, y.predicates + preds) +@replace(p.Bucket) +def lower_bucket(_, **kwargs): + """Rewrite `Bucket` as `SearchedCase`.""" + cases = [] + results = [] + if _.closed == "left": + l_cmp = ops.LessEqual + r_cmp = ops.Less + else: + l_cmp = ops.Less + r_cmp = ops.LessEqual + + user_num_buckets = len(_.buckets) - 1 + + bucket_id = 0 + if _.include_under: + if user_num_buckets > 0: + cmp = ops.Less if _.close_extreme else r_cmp + else: + cmp = ops.LessEqual if _.closed == "right" else ops.Less + cases.append(cmp(_.arg, _.buckets[0])) + results.append(bucket_id) + bucket_id += 1 + + for j, (lower, upper) in enumerate(zip(_.buckets, _.buckets[1:])): + if _.close_extreme and ( + (_.closed == "right" and j == 0) + or (_.closed == "left" and j == (user_num_buckets - 1)) + ): + cases.append( + ops.And(ops.LessEqual(lower, _.arg), ops.LessEqual(_.arg, upper)) + ) + results.append(bucket_id) + else: + cases.append(ops.And(l_cmp(lower, _.arg), r_cmp(_.arg, upper))) + results.append(bucket_id) + bucket_id += 1 -@replace(p.Filter(y @ p.Project)) -def reorder_filter_project(_, y): - rule = p.Field(y, name) >> Item(y.values, name) - preds = 
tuple(v.replace(rule, filter=ops.Value) for v in _.predicates) + if _.include_over: + if user_num_buckets > 0: + cmp = ops.Less if _.close_extreme else l_cmp + else: + cmp = ops.Less if _.closed == "right" else ops.LessEqual + + cases.append(cmp(_.buckets[-1], _.arg)) + results.append(bucket_id) + bucket_id += 1 + + return ops.SearchedCase( + cases=tuple(cases), results=tuple(results), default=ops.NULL + ) - inner = ops.Filter(y.parent, preds) - rule = p.Field(y.parent, name) >> d.Field(inner, name) - projs = {k: v.replace(rule, filter=ops.Value) for k, v in y.values.items()} - return ops.Project(inner, projs) +@replace(p.Capitalize) +def lower_capitalize(_, **kwargs): + """Rewrite Capitalize in terms of substring, concat, upper, and lower.""" + first = ops.Uppercase(ops.Substring(_.arg, start=0, length=1)) + # use length instead of length - 1 to avoid backends complaining about + # asking for negative length + # + # there are at most length - 1 characters, so asking for length is fine + rest = ops.Lowercase(ops.Substring(_.arg, start=1, length=ops.StringLength(_.arg))) + return ops.StringConcat((first, rest)) -def simplify(node): - # TODO(kszucs): add a utility to the graph module to do rewrites in multiple - # passes after each other - node = node.replace(reorder_filter_project) - node = node.replace(reorder_filter_project) - node = node.replace(subsequent_projects | subsequent_filters) - node = node.replace(complete_reprojection) - return node +@replace(p.Sample) +def lower_sample(_, **kwargs): + """Rewrite Sample as `t.filter(random() <= fraction)`. + + Errors as unsupported if a `seed` is specified. + """ + if _.seed is not None: + raise com.UnsupportedOperationError( + "`Table.sample` with a random seed is unsupported" + ) + return ops.Filter(_.parent, (ops.LessEqual(ops.RandomScalar(), _.fraction),)) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 3ae5fc10e4..2c2efe528d 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -2,6 +2,5 @@ from __future__ import annotations from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 -from bigframes_vendored.ibis.expr.operations.arrays import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/arrays.py b/third_party/bigframes_vendored/ibis/expr/operations/arrays.py deleted file mode 100644 index a0ad915a9b..0000000000 --- a/third_party/bigframes_vendored/ibis/expr/operations/arrays.py +++ /dev/null @@ -1,18 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/arrays.py -from __future__ import annotations - -import ibis.expr.datatypes as dt -from ibis.expr.operations.core import Unary - - -class GenerateArray(Unary): - """ - Generates an array of values, similar to ibis.range(), but with simpler and - more efficient SQL generation. 
- """ - - dtype = dt.Array(dt.int64) - - -class SafeCastToDatetime(Unary): - dtype = dt.Timestamp(timezone=None) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index 1eb0554137..ea1f766a71 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -5,5 +5,7 @@ import ibis.expr.operations.core as ibis_ops_core +# TODO(swast): Remove once supported upstream. +# See: https://github.com/ibis-project/ibis/issues/9542 class ToJsonString(ibis_ops_core.Unary): dtype = dt.string diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index bd971e408a..a428c73449 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -9,17 +9,6 @@ from ibis.expr.operations.reductions import Filterable, Reduction -class ApproximateMultiQuantile(Filterable, Reduction): - """Calculate (approximately) evenly-spaced quantiles. - - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles - """ - - arg: ibis_ops_core.Value - num_bins: ibis_ops_core.Value[dt.Int64] - dtype = dt.Array(dt.float64) - - class ArrayAggregate(Filterable, Reduction): """ Collects the elements of this expression into an ordered array. Similar to @@ -34,4 +23,4 @@ def dtype(self): return dt.Array(self.arg.dtype) -__all__ = ["ApproximateMultiQuantile", "ArrayAggregate"] +__all__ = ["ArrayAggregate"] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index fe1c8a12ff..e7f555c729 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -544,7 +544,7 @@ def to_dict( >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() - {'col1': {0: 1, 1: 2}, 'col2': {0: 3, 1: 4}} + {'col1': {np.int64(0): 1, np.int64(1): 2}, 'col2': {np.int64(0): 3, np.int64(1): 4}} You can specify the return orientation. @@ -1773,7 +1773,7 @@ def iterrows(self): ... }) >>> index, row = next(df.iterrows()) >>> index - 0 + np.int64(0) >>> row A 1 B 4 @@ -1794,7 +1794,7 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): ... 'B': [4, 5, 6], ... }) >>> next(df.itertuples(name="Pair")) - Pair(Index=0, A=1, B=4) + Pair(Index=np.int64(0), A=np.int64(1), B=np.int64(4)) Args: index (bool, default True): @@ -6486,12 +6486,12 @@ def iat(self): Get value at specified row/column pair >>> df.iat[1, 2] - 1 + np.int64(1) Get value within a series >>> df.loc[0].iat[1] - 2 + np.int64(2) Returns: bigframes.core.indexers.IatDataFrameIndexer: Indexers object. @@ -6519,12 +6519,12 @@ def at(self): Get value at specified row/column pair >>> df.at[4, 'B'] - 2 + np.int64(2) Get value within a series >>> df.loc[5].at['B'] - 4 + np.int64(4) Returns: bigframes.core.indexers.AtDataFrameIndexer: Indexers object. 
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a30ed9cd92..a6363e3285 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -205,7 +205,7 @@ def hasnans(self) -> bool: 3 dtype: Float64 >>> s.hasnans - True + np.True_ Returns: bool @@ -592,7 +592,7 @@ def agg(self, func): dtype: Int64 >>> s.agg('min') - 1 + np.int64(1) >>> s.agg(['min', 'max']) min 1 @@ -626,7 +626,7 @@ def count(self): 2 dtype: Float64 >>> s.count() - 2 + np.int64(2) Returns: int or Series (if level specified): Number of non-null values in the @@ -834,12 +834,12 @@ def corr(self, other, method="pearson", min_periods=None) -> float: >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) >>> s1.corr(s2) - -0.8510644963469901 + np.float64(-0.8510644963469901) >>> s1 = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s2 = bpd.Series([1, 2, 3], index=[2, 1, 0]) >>> s1.corr(s2) - -1.0 + np.float64(-1.0) Args: other (Series): @@ -870,15 +870,15 @@ def autocorr(self, lag: int = 1) -> float: >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS - 0.10355... + np.float64(0.10355263309024067) >>> s.autocorr(lag=2) - -1.0 + np.float64(-1.0) If the Pearson correlation is not well defined, then 'NaN' is returned. >>> s = bpd.Series([1, 0, 0, 0]) >>> s.autocorr() - nan + np.float64(nan) Args: lag (int, default 1): @@ -951,12 +951,12 @@ def dot(self, other) -> Series | np.ndarray: >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) >>> s.dot(other) - 8 + np.int64(8) You can also use the operator ``@`` for the dot product: >>> s @ other - 8 + np.int64(8) Args: other (Series): @@ -3080,7 +3080,7 @@ def max( 1 3 dtype: Int64 >>> s.max() - 3 + np.int64(3) Calculating the max of a Series containing ``NA`` values: @@ -3091,7 +3091,7 @@ def max( 2 dtype: Int64 >>> s.max() - 3 + np.int64(3) Returns: scalar: Scalar. @@ -3120,7 +3120,7 @@ def min( 1 3 dtype: Int64 >>> s.min() - 1 + np.int64(1) Calculating the min of a Series containing ``NA`` values: @@ -3131,7 +3131,7 @@ def min( 2 dtype: Int64 >>> s.min() - 1 + np.int64(1) Returns: scalar: Scalar. @@ -3207,7 +3207,7 @@ def sum(self): 1 3 dtype: Int64 >>> s.sum() - 4 + np.int64(4) Calculating the sum of a Series containing ``NA`` values: @@ -3218,7 +3218,7 @@ def sum(self): 2 dtype: Int64 >>> s.sum() - 4 + np.int64(4) Returns: scalar: Scalar. @@ -3241,7 +3241,7 @@ def mean(self): 1 3 dtype: Int64 >>> s.mean() - 2.0 + np.float64(2.0) Calculating the mean of a Series containing ``NA`` values: @@ -3252,7 +3252,7 @@ def mean(self): 2 dtype: Int64 >>> s.mean() - 2.0 + np.float64(2.0) Returns: scalar: Scalar. @@ -3285,7 +3285,7 @@ def quantile( >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) - 2.5 + np.float64(2.5) >>> s.quantile([.25, .5, .75]) 0.25 1.75 0.5 2.5 @@ -3576,10 +3576,10 @@ def argmax(self): dtype: Float64 >>> s.argmax() - 2 + np.int64(2) >>> s.argmin() - 0 + np.int64(0) The maximum cereal calories is the third element and the minimum cereal calories is the first element, since series is zero-indexed. @@ -3612,10 +3612,10 @@ def argmin(self): dtype: Float64 >>> s.argmax() - 2 + np.int64(2) >>> s.argmin() - 0 + np.int64(0) The maximum cereal calories is the third element and the minimum cereal calories is the first element, since series is zero-indexed. 
@@ -3887,11 +3887,11 @@ def is_monotonic_increasing(self) -> bool: >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing - True + np.True_ >>> s = bpd.Series([3, 2, 1]) >>> s.is_monotonic_increasing - False + np.False_ Returns: bool: Boolean. @@ -3910,11 +3910,11 @@ def is_monotonic_decreasing(self) -> bool: >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing - True + np.True_ >>> s = bpd.Series([1, 2, 3]) >>> s.is_monotonic_decreasing - False + np.False_ Returns: bool: Boolean. @@ -4041,7 +4041,7 @@ def iat(self): Get value at specified row number >>> s.iat[1] - 2 + np.int64(2) Returns: bigframes.core.indexers.IatSeriesIndexer: Indexers object. @@ -4066,7 +4066,7 @@ def at(self): Get value at specified row label >>> s.at['B'] - 2 + np.int64(2) Returns: @@ -4314,7 +4314,7 @@ def __getitem__(self, indexer): >>> s = bpd.Series([15, 30, 45]) >>> s[1] - 30 + np.int64(30) >>> s[0:2] 0 15 1 30 diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 8e8b2c1952..c1a909e849 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -36,13 +36,13 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) >>> accuracy_score = bigframes.ml.metrics.accuracy_score(y_true, y_pred) >>> accuracy_score - 0.5 + np.float64(0.5) If False, return the number of correctly classified samples: >>> accuracy_score = bigframes.ml.metrics.accuracy_score(y_true, y_pred, normalize=False) >>> accuracy_score - 2 + np.int64(2) Args: y_true (Series or DataFrame of shape (n_samples,)): diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index dee8b350c0..7b97526de2 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -37,7 +37,7 @@ def auc(x, y) -> float: >>> y = bpd.DataFrame([2, 3, 4, 5]) >>> auc = bigframes.ml.metrics.auc(x, y) >>> auc - 3.5 + np.float64(3.5) The input can be Series: @@ -47,7 +47,7 @@ def auc(x, y) -> float: ... ) >>> auc = bigframes.ml.metrics.auc(df["x"], df["y"]) >>> auc - 3.5 + np.float64(3.5) Args: @@ -77,7 +77,7 @@ def roc_auc_score(y_true, y_score) -> float: >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(y_true, y_score) >>> roc_auc_score - 0.625 + np.float64(0.625) The input can be Series: @@ -87,7 +87,7 @@ def roc_auc_score(y_true, y_score) -> float: ... 
) >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) >>> roc_auc_score - 0.625 + np.float64(0.625) Args: y_true (Series or DataFrame of shape (n_samples,)): diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index c3e579bd29..56f78c6d0b 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -52,7 +52,7 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) >>> r2_score = bigframes.ml.metrics.r2_score(y_true, y_pred) >>> r2_score - 0.9486081370449679 + np.float64(0.9486081370449679) Args: y_true (Series or DataFrame of shape (n_samples,)): @@ -79,7 +79,7 @@ def mean_squared_error(y_true, y_pred) -> float: >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) >>> mse = bigframes.ml.metrics.mean_squared_error(y_true, y_pred) >>> mse - 0.375 + np.float64(0.375) Args: y_true (Series or DataFrame of shape (n_samples,)): From 596b03bb3ea27cead9b90200b9ef3cdcd99ca184 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 13 Sep 2024 15:54:22 -0700 Subject: [PATCH 09/18] fix: Fix a bug that raises exception when re-indexing columns with their original order (#988) --- bigframes/dataframe.py | 5 +++++ tests/system/small/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d7738a569d..29f22c28b9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1913,6 +1913,11 @@ def _reindex_rows( def _reindex_columns(self, columns): block = self._block new_column_index, indexer = self.columns.reindex(columns) + + if indexer is None: + # The new index is the same as the old one. Do nothing. + return self + result_cols = [] for label, index in zip(columns, indexer): if index >= 0: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9e046dc62e..b9291085fa 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3664,6 +3664,21 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): ) +def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): + # First, make sure the two dataframes have the same columns in order. 
+ columns = ["int64_col", "int64_too"] + bf = scalars_df_index[columns] + pd_df = scalars_pandas_df_index[columns] + + bf_result = bf.reindex(columns=columns).to_pandas() + pd_result = pd_df.reindex(columns=columns) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): unsupported = [ "geography_col", From deac6d2d6e459b26c05f6e5ff328ea03a3cff45f Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Sep 2024 11:09:11 -0700 Subject: [PATCH 10/18] feat: add "include" param to describe for string types (#973) --- bigframes/dataframe.py | 88 ++++++++++++++++----- bigframes/operations/aggregations.py | 11 +++ tests/system/small/test_dataframe.py | 81 +++++++++++++++++++ tests/unit/operations/__init__.py | 13 +++ tests/unit/operations/test_aggregations.py | 92 ++++++++++++++++++++++ 5 files changed, 267 insertions(+), 18 deletions(-) create mode 100644 tests/unit/operations/__init__.py create mode 100644 tests/unit/operations/test_aggregations.py diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 29f22c28b9..c4597ab843 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -18,6 +18,7 @@ import datetime import inspect +import itertools import re import sys import textwrap @@ -70,6 +71,7 @@ import bigframes.exceptions import bigframes.formatting_helpers as formatter import bigframes.operations as ops +import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops import bigframes.operations.plotting as plotting import bigframes.operations.structs @@ -2207,14 +2209,17 @@ def agg( self, func: str | typing.Sequence[str] ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): - if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE - for dtype in self.dtypes - ): - raise NotImplementedError( - f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}" - ) aggregations = [agg_ops.lookup_agg_func(f) for f in func] + + for dtype, agg in itertools.product(self.dtypes, aggregations): + if not bigframes.operations.aggregations.is_agg_op_supported( + dtype, agg + ): + raise NotImplementedError( + f"Type {dtype} does not support aggregation {agg}. " + f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}" + ) + return DataFrame( self._block.summarize( self._block.value_columns, @@ -2280,16 +2285,55 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(permissive=False) - if len(df_numeric.columns) == 0: - raise NotImplementedError( - f"df.describe() currently only supports numeric values. 
{constants.FEEDBACK_LINK}" + _NUMERICAL_DISCRIBE_AGGS = ( + "count", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ) + _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique") + + def describe(self, include: None | Literal["all"] = None) -> DataFrame: + if include is None: + numeric_df = self._drop_non_numeric(permissive=False) + if len(numeric_df.columns) == 0: + # Describe eligible non-numerical columns + result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS) + else: + # Otherwise, only describe numerical columns + result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS) + return typing.cast(DataFrame, result) + + elif include == "all": + numeric_result = typing.cast( + DataFrame, + self._drop_non_numeric(permissive=False).agg( + self._NUMERICAL_DISCRIBE_AGGS + ), + ) + string_result = typing.cast( + DataFrame, + self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS), ) - result = df_numeric.agg( - ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] - ) - return typing.cast(DataFrame, result) + + if len(numeric_result.columns) == 0: + return string_result + elif len(string_result.columns) == 0: + return numeric_result + else: + import bigframes.core.reshape as rs + + # Use reindex after join to preserve the original column order. + return rs.concat( + [numeric_result, string_result], axis=1 + )._reindex_columns(self.columns) + + else: + raise ValueError(f"Unsupported include type: {include}") def skew(self, *, numeric_only: bool = False): if not numeric_only: @@ -2487,7 +2531,7 @@ def unstack(self, level: LevelsType = -1): return DataFrame(pivot_block) def _drop_non_numeric(self, permissive=True) -> DataFrame: - types_to_keep = ( + numerical_types = ( set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) @@ -2495,10 +2539,18 @@ def _drop_non_numeric(self, permissive=True) -> DataFrame: non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype not in types_to_keep + if dtype not in numerical_types ] return DataFrame(self._block.drop_columns(non_numeric_cols)) + def _drop_non_string(self) -> DataFrame: + string_cols = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype == bigframes.dtypes.STRING_DTYPE + ] + return DataFrame(self._block.select_columns(string_cols)) + def _drop_non_bool(self) -> DataFrame: non_bool_cols = [ col_id diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 675ead1188..f20429e449 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -562,3 +562,14 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") + + +def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool: + if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: + return True + + if dtype == dtypes.STRING_DTYPE: + return isinstance(op, (CountOp, NuniqueOp)) + + # For all other types, support no aggregation + return False diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b9291085fa..fe63a1ed28 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2612,6 +2612,87 @@ def test_df_describe(scalars_dfs): ).all() +@skip_legacy_pandas +@pytest.mark.parametrize("include", [None, "all"]) +def 
test_df_describe_non_numerical(scalars_dfs, include): + scalars_df, scalars_pandas_df = scalars_dfs + + non_numerical_columns = ["string_col"] + + modified_bf = scalars_df[non_numerical_columns] + bf_result = modified_bf.describe(include=include).to_pandas() + + modified_pd_df = scalars_pandas_df[non_numerical_columns] + pd_result = modified_pd_df.describe(include=include) + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique"]) + pd_result = pd_result.reindex( + ["count", "unique"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pd.testing.assert_frame_equal( + pd_result[non_numerical_columns].astype("Int64"), + bf_result[non_numerical_columns], + check_index_type=False, + ) + + +@skip_legacy_pandas +def test_df_describe_mixed_types_include_all(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + numerical_columns = [ + "int64_col", + "float64_col", + ] + non_numerical_columns = ["string_col"] + supported_columns = numerical_columns + non_numerical_columns + + modified_bf = scalars_df[supported_columns] + bf_result = modified_bf.describe(include="all").to_pandas() + + modified_pd_df = scalars_pandas_df[supported_columns] + pd_result = modified_pd_df.describe(include="all") + + # Drop quartiles, as they are approximate + bf_min = bf_result.loc["min", :] + bf_p25 = bf_result.loc["25%", :] + bf_p50 = bf_result.loc["50%", :] + bf_p75 = bf_result.loc["75%", :] + bf_max = bf_result.loc["max", :] + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"]) + pd_result = pd_result.reindex( + ["count", "unique", "mean", "std", "min", "max"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pd.testing.assert_frame_equal( + pd_result[numerical_columns].astype("Float64"), + bf_result[numerical_columns], + check_index_type=False, + ) + + pd.testing.assert_frame_equal( + pd_result[non_numerical_columns].astype("Int64"), + bf_result[non_numerical_columns], + check_index_type=False, + ) + + # Double-check that quantiles are at least plausible. + assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p50) + & (bf_p75 <= bf_max) + ).all() + + def test_df_transpose(): # Include some floats to ensure type coercion values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] diff --git a/tests/unit/operations/__init__.py b/tests/unit/operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/unit/operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
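For reference, a hedged usage sketch of the `include` parameter added to `DataFrame.describe` in this change (toy column names and data; assumes an active BigQuery session, and is not part of the patch itself):

import bigframes.pandas as bpd

df = bpd.DataFrame({"num": [1, 2, 3], "txt": ["a", "b", "b"]})

df.describe()               # numeric columns only: count, mean, std, min, 25%, 50%, 75%, max
df[["txt"]].describe()      # no numeric columns, so falls back to count and nunique
df.describe(include="all")  # numeric stats plus count/nunique for the string column
df.describe(include="foo")  # raises ValueError (only None and "all" are accepted)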
diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py new file mode 100644 index 0000000000..4cb6934c9d --- /dev/null +++ b/tests/unit/operations/test_aggregations.py @@ -0,0 +1,92 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.dtypes as dtypes +from bigframes.operations.aggregations import ( + all_op, + any_op, + count_op, + dense_rank_op, + first_op, + is_agg_op_supported, + max_op, + mean_op, + median_op, + min_op, + nunique_op, + product_op, + rank_op, + size_op, + std_op, + sum_op, + var_op, +) + +_ALL_OPS = set( + [ + size_op, + sum_op, + mean_op, + median_op, + product_op, + max_op, + min_op, + std_op, + var_op, + count_op, + nunique_op, + rank_op, + dense_rank_op, + all_op, + any_op, + first_op, + ] +) +_STRING_SUPPORTED_OPS = set([count_op, nunique_op]) + + +@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) +@pytest.mark.parametrize("op", _ALL_OPS) +def test_is_agg_op_supported_numerical_support_all(dtype, op): + assert is_agg_op_supported(dtype, op) is True + + +@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) +@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS) +def test_is_agg_op_supported_string_support_ops(dtype, op): + assert is_agg_op_supported(dtype, op) is True + + +@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) +@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS) +def test_is_agg_op_supported_string_not_support_ops(dtype, op): + assert is_agg_op_supported(dtype, op) is False + + +@pytest.mark.parametrize( + "dtype", + [ + dtypes.BYTES_DTYPE, + dtypes.DATE_DTYPE, + dtypes.TIME_DTYPE, + dtypes.DATETIME_DTYPE, + dtypes.TIMESTAMP_DTYPE, + dtypes.GEO_DTYPE, + ], +) +@pytest.mark.parametrize("op", _ALL_OPS) +def test_is_agg_op_supported_non_numerical_no_support(dtype, op): + assert is_agg_op_supported(dtype, op) is False From f7c03dcaf7ee4d62497f6653851e390795fc60a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 16 Sep 2024 14:22:26 -0500 Subject: [PATCH 11/18] feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider (#981) * feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider * fix dropna with subset=None * refactor: remove circular dependencies preventing local doctest runs With this change I can once again run ``` pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py ``` Note: having multiple `version.py` files should be fine. release-please will update all such files it finds. * fix doctest * Revert "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 57e8335af83558323d4df213854efc1edcdb35f0, reversing changes made to 197074abc53c5a85c66ec1940e54ea3f2c9c3677. * Reapply "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 0f18294513a2a6a806f5fff4ff38646d4029a884. 
* loop over tuple result --------- Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- bigframes/core/block_transforms.py | 12 ++++++++-- bigframes/dataframe.py | 22 +++++++++++++++++-- tests/system/small/test_dataframe.py | 21 +++++++++++------- tests/unit/test_dataframe.py | 9 ++++++++ .../bigframes_vendored/pandas/core/frame.py | 17 ++++++++++++++ 5 files changed, 69 insertions(+), 12 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 1990647e0a..398c6ab26a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,7 +15,7 @@ import functools import typing -from typing import Sequence +from typing import Optional, Sequence import bigframes_vendored.constants as constants import pandas as pd @@ -488,11 +488,19 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", + subset: Optional[typing.Sequence[str]] = None, ): """ Drop na entries from block """ - predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids] + if subset is None: + subset = column_ids + + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] if len(predicates) == 0: return block if how == "any": diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c4597ab843..c2fb9336f3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2027,8 +2027,9 @@ def dropna( self, *, axis: int | str = 0, - inplace: bool = False, how: str = "any", + subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: if inplace: @@ -2040,8 +2041,25 @@ def dropna( axis_n = utils.get_axis_number(axis) + if subset is not None and axis_n != 0: + raise NotImplementedError( + f"subset only supported when axis=0. {constants.FEEDBACK_LINK}" + ) + if axis_n == 0: - result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore + # subset needs to be converted into column IDs, not column labels. 
+ if subset is None: + subset_ids = None + elif not utils.is_list_like(subset): + subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]] + else: + subset_ids = [ + id_ + for label in subset + for id_ in self._block.label_to_col_id[label] + ] + + result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index fe63a1ed28..b4c81bfbef 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize( - ("axis", "how", "ignore_index"), + ("axis", "how", "ignore_index", "subset"), [ - (0, "any", False), - (0, "any", True), - (1, "any", False), - (1, "all", False), + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index): +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 6370d1b987..560c0cf0f4 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -20,6 +20,15 @@ from . import resources +def test_dataframe_dropna_axis_1_subset_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="subset"): + dataframe.dropna(axis=1, subset=["col1", "col2"]) + + def test_dataframe_repr_with_uninitialized_object(): """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e7f555c729..970883257c 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1614,6 +1614,8 @@ def dropna( *, axis: int | str = 0, how: str = "any", + subset=None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: """Remove missing values. @@ -1662,6 +1664,15 @@ def dropna( [3 rows x 3 columns] + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'toy']) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip + + [2 rows x 3 columns] + Args: axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Determine if rows or columns which contain missing values are @@ -1675,6 +1686,12 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. + subset (column label or sequence of labels, optional): + Labels along other axis to consider, e.g. 
if you are dropping + rows these would be a list of columns to include. + Only supports axis=0. + inplace (bool, default ``False``): + Not supported. ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. From b00db7eb61752f4f93b0244c8eb72af7d24c6524 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 16 Sep 2024 12:52:59 -0700 Subject: [PATCH 12/18] refactor: Make column id namespaces explicit (#982) --- bigframes/core/__init__.py | 89 +++++++++++++++++++++--------- bigframes/core/blocks.py | 72 +++++++++--------------- bigframes/core/compile/compiled.py | 4 +- bigframes/core/compile/compiler.py | 12 ++-- bigframes/core/nodes.py | 54 +++++++++++++++--- bigframes/dataframe.py | 3 +- bigframes/session/executor.py | 10 ++-- bigframes/session/loader.py | 3 +- tests/unit/test_planner.py | 27 +++++---- 9 files changed, 165 insertions(+), 109 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index aa3aa63cc2..8ad00a46c6 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -18,7 +18,7 @@ import functools import io import typing -from typing import Iterable, Optional, Sequence, Tuple +from typing import Iterable, List, Optional, Sequence, Tuple import warnings import google.cloud.bigquery @@ -106,6 +106,7 @@ def from_table( @property def column_ids(self) -> typing.Sequence[str]: + """Returns column ids as strings.""" return self.schema.names @property @@ -182,10 +183,11 @@ def order_by(self, by: Sequence[OrderingExpression]) -> ArrayValue: def reversed(self) -> ArrayValue: return ArrayValue(nodes.ReversedNode(child=self.node)) - def promote_offsets(self, col_id: str) -> ArrayValue: + def promote_offsets(self) -> Tuple[ArrayValue, str]: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. 
""" + col_id = self._gen_namespaced_uid() if self.node.order_ambiguous and not (self.session._strictly_ordered): if not self.session._allows_ambiguity: raise ValueError( @@ -197,7 +199,10 @@ def promote_offsets(self, col_id: str) -> ArrayValue: bigframes.exceptions.AmbiguousWindowWarning, ) - return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) + return ( + ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)), + col_id, + ) def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: """Append together multiple ArrayValue objects.""" @@ -205,13 +210,19 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) - def compute_values(self, assignments: Sequence[Tuple[ex.Expression, str]]): - return ArrayValue( - nodes.ProjectionNode(child=self.node, assignments=tuple(assignments)) + def compute_values(self, assignments: Sequence[ex.Expression]): + col_ids = self._gen_namespaced_uids(len(assignments)) + ex_id_pairs = tuple((ex, id) for ex, id in zip(assignments, col_ids)) + return ( + ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), + col_ids, ) - def project_to_id(self, expression: ex.Expression, output_id: str): - return self.compute_values(((expression, output_id),)) + def project_to_id(self, expression: ex.Expression): + array_val, ids = self.compute_values( + [expression], + ) + return array_val, ids[0] def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case @@ -234,19 +245,22 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue: def create_constant( self, - destination_id: str, value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> ArrayValue: + ) -> Tuple[ArrayValue, str]: + destination_id = self._gen_namespaced_uid() if pandas.isna(value): # Need to assign a data type when value is NaN. dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE - return ArrayValue( - nodes.ProjectionNode( - child=self.node, - assignments=((ex.const(value, dtype), destination_id),), - ) + return ( + ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=((ex.const(value, dtype), destination_id),), + ) + ), + destination_id, ) def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: @@ -297,11 +311,10 @@ def project_window_op( column_name: str, op: agg_ops.UnaryWindowOp, window_spec: WindowSpec, - output_name=None, *, never_skip_nulls=False, skip_reproject_unsafe: bool = False, - ) -> ArrayValue: + ) -> Tuple[ArrayValue, str]: """ Creates a new expression based on this expression with unary operation applied to one column. 
column_name: the id of the input column present in the expression @@ -324,16 +337,20 @@ def project_window_op( bigframes.exceptions.AmbiguousWindowWarning, ) - return ArrayValue( - nodes.WindowOpNode( - child=self.node, - column_name=column_name, - op=op, - window_spec=window_spec, - output_name=output_name, - never_skip_nulls=never_skip_nulls, - skip_reproject_unsafe=skip_reproject_unsafe, - ) + output_name = self._gen_namespaced_uid() + return ( + ArrayValue( + nodes.WindowOpNode( + child=self.node, + column_name=column_name, + op=op, + window_spec=window_spec, + output_name=output_name, + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, + ) + ), + output_name, ) def _reproject_to_table(self) -> ArrayValue: @@ -410,3 +427,21 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: def get_offset_for_name(self, name: str): return self.schema.names.index(name) + + # Deterministically generate namespaced ids for new variables + # These new ids are only unique within the current namespace. + # Many operations, such as joins, create new namespaces. See: BigFrameNode.defines_namespace + # When migrating to integer ids, these will generate the next available integer, in order to densely pack ids + # this will help represent variables sets as compact bitsets + def _gen_namespaced_uid(self) -> str: + return self._gen_namespaced_uids(1)[0] + + def _gen_namespaced_uids(self, n: int) -> List[str]: + i = len(self.node.defined_variables) + genned_ids: List[str] = [] + while len(genned_ids) < n: + attempted_id = f"col_{i}" + if attempted_id not in self.node.defined_variables: + genned_ids.append(attempted_id) + i = i + 1 + return genned_ids diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 39cfed588f..7f6f5f1cc9 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -336,8 +336,7 @@ def reset_index(self, drop: bool = True) -> Block: self.session._default_index_type == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 ): - new_index_col_id = guid.generate_guid() - expr = expr.promote_offsets(new_index_col_id) + expr, new_index_col_id = expr.promote_offsets() new_index_cols = [new_index_col_id] elif self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL: new_index_cols = [] @@ -846,9 +845,7 @@ def project_expr( """ Apply a scalar expression to the block. Creates a new column to store the result. 
""" - # TODO(tbergeron): handle labels safely so callers don't need to - result_id = guid.generate_guid() - array_val = self._expr.project_to_id(expr, result_id) + array_val, result_id = self._expr.project_to_id(expr) block = Block( array_val, index_columns=self.index_columns, @@ -971,12 +968,10 @@ def apply_window_op( for key in window_spec.grouping_keys: block, not_null_id = block.apply_unary_op(key, ops.notnull_op) block = block.filter_by_id(not_null_id).drop_columns([not_null_id]) - result_id = guid.generate_guid() - expr = block._expr.project_window_op( + expr, result_id = block._expr.project_window_op( column, op, window_spec, - result_id, skip_reproject_unsafe=skip_reproject_unsafe, never_skip_nulls=never_skip_nulls, ) @@ -1005,8 +1000,7 @@ def create_constant( label: Label = None, dtype: typing.Optional[bigframes.dtypes.Dtype] = None, ) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self.expr.create_constant(result_id, scalar_constant, dtype=dtype) + expr, result_id = self.expr.create_constant(scalar_constant, dtype=dtype) # Create index copy with label inserted # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html labels = self.column_labels.insert(len(self.column_labels), label) @@ -1063,10 +1057,9 @@ def aggregate_all_and_stack( ) for col_id in self.value_columns ] - index_id = guid.generate_guid() - result_expr = self.expr.aggregate( + result_expr, index_id = self.expr.aggregate( aggregations, dropna=dropna - ).create_constant(index_id, None, None) + ).create_constant(None, None) # Transpose as last operation so that final block has valid transpose cache return Block( result_expr, @@ -1077,8 +1070,7 @@ def aggregate_all_and_stack( else: # axis_n == 1 # using offsets as identity to group on. # TODO: Allow to promote identity/total_order columns instead for better perf - offset_col = guid.generate_guid() - expr_with_offsets = self.expr.promote_offsets(offset_col) + expr_with_offsets, offset_col = self.expr.promote_offsets() stacked_expr, (_, value_col_ids, passthrough_cols,) = unpivot( expr_with_offsets, row_labels=self.column_labels, @@ -1224,8 +1216,7 @@ def aggregate( names: typing.List[Label] = [] if len(by_column_ids) == 0: - label_id = guid.generate_guid() - result_expr = result_expr.create_constant(label_id, 0, pd.Int64Dtype()) + result_expr, label_id = result_expr.create_constant(0, pd.Int64Dtype()) index_columns = (label_id,) names = [None] else: @@ -1275,8 +1266,7 @@ def get_stat( for stat in stats_to_fetch ] expr = self.expr.aggregate(aggregations) - offset_index_id = guid.generate_guid() - expr = expr.promote_offsets(offset_index_id) + expr, offset_index_id = expr.promote_offsets() block = Block( expr, index_columns=[offset_index_id], @@ -1303,8 +1293,7 @@ def get_binary_stat( ) ] expr = self.expr.aggregate(aggregations) - offset_index_id = guid.generate_guid() - expr = expr.promote_offsets(offset_index_id) + expr, offset_index_id = expr.promote_offsets() block = Block( expr, index_columns=[offset_index_id], @@ -1406,9 +1395,10 @@ def explode( expr = self.expr.explode(column_ids) if ignore_index: - new_index_ids = guid.generate_guid() + expr = expr.drop_columns(self.index_columns) + expr, new_index_ids = expr.promote_offsets() return Block( - expr.drop_columns(self.index_columns).promote_offsets(new_index_ids), + expr, column_labels=self.column_labels, # Initiates default index creation using the block constructor. 
index_columns=[new_index_ids], @@ -1593,8 +1583,7 @@ def retrieve_repr_request_results( return computed_df, count, query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self._expr.promote_offsets(result_id) + expr, result_id = self._expr.promote_offsets() return ( Block( expr, @@ -1611,13 +1600,11 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: expr = self._expr new_index_cols = [] for index_col in self._index_columns: - new_col = guid.generate_guid() - expr = expr.project_to_id( + expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( ex.const(prefix), ops.AsTypeOp(to_type="string").as_expr(index_col), ), - output_id=new_col, ) new_index_cols.append(new_col) expr = expr.select_columns((*new_index_cols, *self.value_columns)) @@ -1637,13 +1624,11 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: expr = self._expr new_index_cols = [] for index_col in self._index_columns: - new_col = guid.generate_guid() - expr = expr.project_to_id( + expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( ops.AsTypeOp(to_type="string").as_expr(index_col), ex.const(suffix), ), - output_id=new_col, ) new_index_cols.append(new_col) expr = expr.select_columns((*new_index_cols, *self.value_columns)) @@ -1785,8 +1770,7 @@ def melt( ) if create_offsets_index: - index_id = guid.generate_guid() - unpivot_expr = unpivot_expr.promote_offsets(index_id) + unpivot_expr, index_id = unpivot_expr.promote_offsets() index_cols = [index_id] else: index_cols = [] @@ -2012,12 +1996,10 @@ def merge( coalesced_ids = [] for left_id, right_id in zip(left_join_ids, right_join_ids): - coalesced_id = guid.generate_guid() - joined_expr = joined_expr.project_to_id( + joined_expr, coalesced_id = joined_expr.project_to_id( ops.coalesce_op.as_expr( get_column_left[left_id], get_column_right[right_id] ), - coalesced_id, ) coalesced_ids.append(coalesced_id) @@ -2076,8 +2058,7 @@ def merge( expr = joined_expr index_columns = [] else: - offset_index_id = guid.generate_guid() - expr = joined_expr.promote_offsets(offset_index_id) + expr, offset_index_id = joined_expr.promote_offsets() index_columns = [offset_index_id] return Block(expr, index_columns=index_columns, column_labels=labels) @@ -2442,8 +2423,7 @@ def _get_rows_as_json_values(self) -> Block: # expression. # TODO(shobs): Replace direct SQL manipulation by structured expression # manipulation - ordering_column_name = guid.generate_guid() - expr = self.expr.promote_offsets(ordering_column_name) + expr, ordering_column_name = self.expr.promote_offsets() expr_sql = self.session._to_sql(expr) # Names of the columns to serialize for the row. 
@@ -2869,8 +2849,8 @@ def coalesce_columns( expr = expr.drop_columns([left_id]) elif how == "outer": coalesced_id = guid.generate_guid() - expr = expr.project_to_id( - ops.coalesce_op.as_expr(left_id, right_id), coalesced_id + expr, coalesced_id = expr.project_to_id( + ops.coalesce_op.as_expr(left_id, right_id) ) expr = expr.drop_columns([left_id, right_id]) result_ids.append(coalesced_id) @@ -3047,7 +3027,7 @@ def unpivot( explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] # Build the output rows as a case statment that selects between the N input columns - unpivot_exprs: List[Tuple[ex.Expression, str]] = [] + unpivot_exprs: List[ex.Expression] = [] # Supports producing multiple stacked ouput columns for stacking only part of hierarchical index for input_ids in unpivot_columns: # row explode offset used to choose the input column @@ -3064,11 +3044,11 @@ def unpivot( ) ) col_expr = ops.case_when_op.as_expr(*cases) - unpivot_exprs.append((col_expr, guid.generate_guid())) + unpivot_exprs.append(col_expr) - unpivot_col_ids = [id for _, id in unpivot_exprs] + joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs) - return joined_array.compute_values(unpivot_exprs).select_columns( + return joined_array.select_columns( [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols] ), (tuple(index_col_ids), tuple(unpivot_col_ids), tuple(new_passthrough_cols)) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cd00c98381..693d93de8c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -814,7 +814,7 @@ def project_window_op( column_name: str, op: agg_ops.UnaryWindowOp, window_spec: WindowSpec, - output_name=None, + output_name: str, *, never_skip_nulls=False, ) -> OrderedIR: @@ -823,7 +823,7 @@ def project_window_op( column_name: the id of the input column present in the expression op: the windowable operator to apply to the input column window_spec: a specification of the window over which to apply the operator - output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + output_name: the id to assign to the output of the operator never_skip_nulls: will disable null skipping for operators that would otherwise do so """ # Cannot nest analytic expressions, so reproject to cte first if needed. diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 950c2c2cc7..23501f93c8 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -110,7 +110,7 @@ def compile_cached_table(self, node: nodes.CachedTableNode, ordered: bool = True ) used_columns = ( *node.schema.names, - *node.hidden_columns, + *node._hidden_columns, ) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( @@ -127,18 +127,20 @@ def compile_cached_table(self, node: nodes.CachedTableNode, ordered: bool = True raise ValueError( "Cannot use partially ordered cached value. Result requires total ordering information." 
) - return compiled.OrderedIR( + ir = compiled.OrderedIR( ibis_table, columns=tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( ibis_table[col] ) - for col in node.schema.names + for col in [*node.schema.names, *node._hidden_columns] ), ordering=node.ordering, - hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], ) - + ir = ir._select( + tuple(ir._get_ibis_column(name) for name in node.schema.names) + ) + return ir else: return compiled.UnorderedIR( ibis_table, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index e11fcafff7..2dc9623d89 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -178,6 +178,25 @@ def transform_children( """Apply a function to each child node.""" ... + @property + def defines_namespace(self) -> bool: + """ + If true, this node establishes a new column id namespace. + + If false, this node consumes and produces ids in the namespace + """ + return False + + @functools.cached_property + def defined_variables(self) -> set[str]: + """Full set of variables defined in the namespace, even if not selected.""" + self_defined_variables = set(self.schema.names) + if self.defines_namespace: + return self_defined_variables + return self_defined_variables.union( + *(child.defined_variables for child in self.child_nodes) + ) + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -262,6 +281,10 @@ def transform_children( self, left_child=t(self.left_child), right_child=t(self.right_child) ) + @property + def defines_namespace(self) -> bool: + return True + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -477,9 +500,9 @@ def __post_init__(self): raise ValueError( f"Requested schema {logical_names} cannot be derived from table schema {self.table.physical_schema}" ) - if not set(self.hidden_columns).issubset(physical_names): + if not set(self._hidden_columns).issubset(physical_names): raise ValueError( - f"Requested hidden columns {self.hidden_columns} cannot be derived from table schema {self.table.physical_schema}" + f"Requested hidden columns {self._hidden_columns} cannot be derived from table schema {self.table.physical_schema}" ) @property @@ -498,7 +521,7 @@ def variables_introduced(self) -> int: return len(self.schema.items) + OVERHEAD_VARIABLES @property - def hidden_columns(self) -> typing.Tuple[str, ...]: + def _hidden_columns(self) -> typing.Tuple[str, ...]: """Physical columns used to define ordering but not directly exposed as value columns.""" if self.ordering is None: return () @@ -646,6 +669,13 @@ def variables_introduced(self) -> int: # This operation only renames variables, doesn't actually create new ones return 0 + # TODO: Reuse parent namespace + # Currently, Selection node allows renaming an reusing existing names, so it must establish a + # new namespace. 
+ @property + def defines_namespace(self) -> bool: + return True + @dataclass(frozen=True) class ProjectionNode(UnaryNode): @@ -707,6 +737,10 @@ def schema(self) -> schemata.ArraySchema: def variables_introduced(self) -> int: return 1 + @property + def defines_namespace(self) -> bool: + return True + @dataclass(frozen=True) class AggregateNode(UnaryNode): @@ -752,13 +786,17 @@ def order_ambiguous(self) -> bool: def explicitly_ordered(self) -> bool: return True + @property + def defines_namespace(self) -> bool: + return True + @dataclass(frozen=True) class WindowOpNode(UnaryNode): column_name: str op: agg_ops.UnaryWindowOp window_spec: window.WindowSpec - output_name: typing.Optional[str] = None + output_name: str never_skip_nulls: bool = False skip_reproject_unsafe: bool = False @@ -773,10 +811,6 @@ def non_local(self) -> bool: def schema(self) -> schemata.ArraySchema: input_type = self.child.schema.get_type(self.column_name) new_item_dtype = self.op.output_type(input_type) - if self.output_name is None: - return self.child.schema.update_dtype(self.column_name, new_item_dtype) - if self.output_name in self.child.schema.names: - return self.child.schema.update_dtype(self.output_name, new_item_dtype) return self.child.schema.append( schemata.SchemaItem(self.output_name, new_item_dtype) ) @@ -860,3 +894,7 @@ def relation_ops_created(self) -> int: @functools.cached_property def variables_introduced(self) -> int: return len(self.column_ids) + 1 + + @property + def defines_namespace(self) -> bool: + return True diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c2fb9336f3..862c8dc2c8 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3457,7 +3457,8 @@ def _prepare_export( } if ordering_id is not None: - array_value = array_value.promote_offsets(ordering_id) + array_value, internal_ordering_id = array_value.promote_offsets() + id_overrides[internal_ordering_id] = ordering_id return array_value, id_overrides def map(self, func, na_action: Optional[str] = None) -> DataFrame: diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 0512fa78e3..d2a2e0f1b2 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -84,7 +84,9 @@ def to_sql( Convert an ArrayValue to a sql query that will yield its value. """ if offset_column: - array_value = array_value.promote_offsets(offset_column) + array_value, internal_offset_col = array_value.promote_offsets() + col_id_overrides = dict(col_id_overrides) + col_id_overrides[internal_offset_col] = offset_column node = ( self._get_optimized_plan(array_value.node) if enable_cache @@ -364,12 +366,12 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): "Caching with offsets only supported in strictly ordered mode." 
) offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") - node_w_offsets = array_value.promote_offsets(offset_column).node - sql = self.compiler.compile_unordered(self._get_optimized_plan(node_w_offsets)) + w_offsets, offset_column = array_value.promote_offsets() + sql = self.compiler.compile_unordered(self._get_optimized_plan(w_offsets.node)) tmp_table = self._sql_as_cached_temp_table( sql, - node_w_offsets.schema.to_bigquery(), + w_offsets.schema.to_bigquery(), cluster_cols=[offset_column], ) cached_replacement = array_value.as_cached( diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ce9874e35f..22de367804 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -383,8 +383,7 @@ def read_gbq_table( index_names: Sequence[Hashable] = index_cols if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: - sequential_index_col = bigframes.core.guid.generate_guid("index_") - array_value = array_value.promote_offsets(sequential_index_col) + array_value, sequential_index_col = array_value.promote_offsets() index_cols = [sequential_index_col] index_names = [None] diff --git a/tests/unit/test_planner.py b/tests/unit/test_planner.py index 84dd05ddaa..27ec7d5921 100644 --- a/tests/unit/test_planner.py +++ b/tests/unit/test_planner.py @@ -46,10 +46,9 @@ def test_session_aware_caching_project_filter(): """ Test that if a node is filtered by a column, the node is cached pre-filter and clustered by the filter column. """ - session_objects = [LEAF, LEAF.create_constant("col_c", 4, pd.Int64Dtype())] - target = LEAF.create_constant("col_c", 4, pd.Int64Dtype()).filter( - ops.gt_op.as_expr("col_a", ex.const(3)) - ) + session_objects = [LEAF, LEAF.create_constant(4, pd.Int64Dtype())[0]] + target, _ = LEAF.create_constant(4, pd.Int64Dtype()) + target = target.filter(ops.gt_op.as_expr("col_a", ex.const(3))) result, cluster_cols = planner.session_aware_cache_plan( target.node, [obj.node for obj in session_objects] ) @@ -61,14 +60,16 @@ def test_session_aware_caching_project_multi_filter(): """ Test that if a node is filtered by multiple columns, all of them are in the cluster cols """ - session_objects = [LEAF, LEAF.create_constant("col_c", 4, pd.Int64Dtype())] + obj1 = LEAF + obj2, _ = LEAF.create_constant(4, pd.Int64Dtype()) + session_objects = [obj1, obj2] predicate_1a = ops.gt_op.as_expr("col_a", ex.const(3)) predicate_1b = ops.lt_op.as_expr("col_a", ex.const(55)) predicate_1 = ops.and_op.as_expr(predicate_1a, predicate_1b) predicate_3 = ops.eq_op.as_expr("col_b", ex.const(1)) target = ( LEAF.filter(predicate_1) - .create_constant("col_c", 4, pd.Int64Dtype()) + .create_constant(4, pd.Int64Dtype())[0] .filter(predicate_3) ) result, cluster_cols = planner.session_aware_cache_plan( @@ -84,8 +85,8 @@ def test_session_aware_caching_unusable_filter(): Most filters with multiple column references cannot be used for scan pruning, as they cannot be converted to fixed value ranges. 
""" - session_objects = [LEAF, LEAF.create_constant("col_c", 4, pd.Int64Dtype())] - target = LEAF.create_constant("col_c", 4, pd.Int64Dtype()).filter( + session_objects = [LEAF, LEAF.create_constant(4, pd.Int64Dtype())[0]] + target = LEAF.create_constant(4, pd.Int64Dtype())[0].filter( ops.gt_op.as_expr("col_a", "col_b") ) result, cluster_cols = planner.session_aware_cache_plan( @@ -101,12 +102,10 @@ def test_session_aware_caching_fork_after_window_op(): Windowing is expensive, so caching should always compute the window function, in order to avoid later recomputation. """ - other = LEAF.promote_offsets("offsets_col").create_constant( - "col_d", 5, pd.Int64Dtype() - ) + other = LEAF.promote_offsets()[0].create_constant(5, pd.Int64Dtype())[0] target = ( - LEAF.promote_offsets("offsets_col") - .create_constant("col_c", 4, pd.Int64Dtype()) + LEAF.promote_offsets()[0] + .create_constant(4, pd.Int64Dtype())[0] .filter( ops.eq_op.as_expr("col_a", ops.add_op.as_expr(ex.const(4), ex.const(3))) ) @@ -117,5 +116,5 @@ def test_session_aware_caching_fork_after_window_op(): other.node, ], ) - assert result == LEAF.promote_offsets("offsets_col").node + assert result == LEAF.promote_offsets()[0].node assert cluster_cols == ["col_a"] From c164e85cd3a526908592eb681b43ec04bd4fb47b Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:58:55 +0000 Subject: [PATCH 13/18] build(python): release script update (#993) Source-Link: https://github.com/googleapis/synthtool/commit/71a72973dddbc66ea64073b53eda49f0d22e0942 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:e8dcfd7cbfd8beac3a3ff8d3f3185287ea0625d859168cc80faccfc9a7a00455 Co-authored-by: Owl Bot --- .github/.OwlBot.lock.yaml | 4 ++-- .kokoro/release.sh | 2 +- .kokoro/release/common.cfg | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 6d064ddb9b..597e0c3261 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:94bb690db96e6242b2567a4860a94d48fa48696d092e51b0884a1a2c0a79a407 -# created: 2024-07-31T14:52:44.926548819Z + digest: sha256:e8dcfd7cbfd8beac3a3ff8d3f3185287ea0625d859168cc80faccfc9a7a00455 +# created: 2024-09-16T21:04:09.091105552Z diff --git a/.kokoro/release.sh b/.kokoro/release.sh index 21a9b558c5..b1dd5f09ec 100755 --- a/.kokoro/release.sh +++ b/.kokoro/release.sh @@ -23,7 +23,7 @@ python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source / export PYTHONUNBUFFERED=1 # Move into the package, build the distribution and upload. 
-TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-1") +TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-2") cd github/python-bigquery-dataframes python3 setup.py sdist bdist_wheel twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg index a0c39946cf..824d62f257 100644 --- a/.kokoro/release/common.cfg +++ b/.kokoro/release/common.cfg @@ -28,7 +28,7 @@ before_action { fetch_keystore { keystore_resource { keystore_config_id: 73713 - keyname: "google-cloud-pypi-token-keystore-1" + keyname: "google-cloud-pypi-token-keystore-2" } } } From 06c1b3396d77d1de4f927328bae70cd7b3eb0b0b Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 17 Sep 2024 08:39:28 -0700 Subject: [PATCH 14/18] deps: limit ibis-framework version to 9.2.0 (#989) * chore: limit ibis version to 9.2.0 * remove ibis constrains on 3.12 tests --- setup.py | 2 +- testing/constraints-3.12.txt | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 908a3c244f..833d4fe565 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=9.0.0,<=9.3.0", + "ibis-framework[bigquery] >=9.0.0,<=9.2.0", "jellyfish >=0.8.9", "numpy >=1.24.0", "pandas >=1.5.3", diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index dbbb5a2d88..e69de29bb2 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -1,3 +0,0 @@ -# Some internal modules have moved, -# so make sure we test on all ibis-framework 9.x versions. 
-ibis-framework==9.2.0 From 22b483a9d9bc50b594ef270a691e2ea3a31aa295 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 17 Sep 2024 12:29:17 -0700 Subject: [PATCH 15/18] chore: Use fixture for list/str accessor tests (#992) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: Use fixture for list/str accessor tests * fix format * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add more type coverage in tests * fix format * remove unnecessary dep * remove import --------- Co-authored-by: Owl Bot --- tests/data/repeated.jsonl | 3 + tests/data/repeated_schema.json | 42 ++++++++++++ tests/system/conftest.py | 27 ++++++++ tests/system/small/operations/test_lists.py | 67 +++++++++++++------ tests/system/small/operations/test_strings.py | 25 ++++--- 5 files changed, 134 insertions(+), 30 deletions(-) create mode 100644 tests/data/repeated.jsonl create mode 100644 tests/data/repeated_schema.json diff --git a/tests/data/repeated.jsonl b/tests/data/repeated.jsonl new file mode 100644 index 0000000000..b3c47772f6 --- /dev/null +++ b/tests/data/repeated.jsonl @@ -0,0 +1,3 @@ +{"rowindex": 0, "int_list_col": [1], "bool_list_col": [true], "float_list_col": [1.2, 2.3], "date_list_col": ["2021-07-21"], "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]} +{"rowindex": 1, "int_list_col": [1,2], "bool_list_col": [true, false], "float_list_col": [1.1], "date_list_col": ["2021-07-21", "1987-03-28"], "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3], "string_list_col": ["a", "bc", "de"]} +{"rowindex": 2, "int_list_col": [1,2,3], "bool_list_col": [true], "float_list_col": [0.5, -1.9, 2.3], "date_list_col": ["2017-08-01", "2004-11-22"], "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7], "string_list_col": ["", "a"]} diff --git a/tests/data/repeated_schema.json b/tests/data/repeated_schema.json new file mode 100644 index 0000000000..300f32c994 --- /dev/null +++ b/tests/data/repeated_schema.json @@ -0,0 +1,42 @@ +[ + { + "name": "rowindex", + "type": "INTEGER", + "mode": "REQUIRED" + }, + { + "name": "int_list_col", + "type": "INTEGER", + "mode": "REPEATED" + }, + { + "name": "bool_list_col", + "type": "BOOLEAN", + "mode": "REPEATED" + }, + { + "name": "float_list_col", + "type": "FLOAT", + "mode": "REPEATED" + }, + { + "name": "date_list_col", + "type": "DATE", + "mode": "REPEATED" + }, + { + "name": "date_time_list_col", + "type": "DATETIME", + "mode": "REPEATED" + }, + { + "name": "numeric_list_col", + "type": "NUMERIC", + "mode": "REPEATED" + }, + { + "name": "string_list_col", + "type": "STRING", + "mode": "REPEATED" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 49cd887cfd..9cfb9082af 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -39,6 +39,7 @@ import bigframes import bigframes.dataframe import bigframes.pandas as bpd +import bigframes.series import tests.system.utils # Use this to control the number of cloud functions being deleted in a single @@ -294,6 +295,7 @@ def load_test_data_tables( ("scalars", "scalars_schema.json", "scalars.jsonl"), ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("nested", "nested_schema.json", "nested.jsonl"), + ("repeated", "repeated_schema.json", "repeated.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", 
"time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), @@ -370,6 +372,11 @@ def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] +@pytest.fixture(scope="session") +def repeated_table_id(test_data_tables) -> str: + return test_data_tables["repeated"] + + @pytest.fixture(scope="session") def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] @@ -410,6 +417,26 @@ def nested_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def repeated_df( + repeated_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """Returns a DataFrame containing columns of list type.""" + return session.read_gbq(repeated_table_id, index_col="rowindex") + + +@pytest.fixture(scope="session") +def repeated_pandas_df() -> pd.DataFrame: + """Returns a DataFrame containing columns of list type.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + df = df.set_index("rowindex") + return df + + @pytest.fixture(scope="session") def scalars_df_default_index( scalars_df_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 7ecf79dc6a..7b39bdebd5 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -18,8 +18,6 @@ import pyarrow as pa import pytest -import bigframes.pandas as bpd - from ...utils import assert_series_equal @@ -32,19 +30,34 @@ pytest.param(slice(0, 2, None), id="default_step_slice"), ], ) -def test_getitem(key): +@pytest.mark.parametrize( + ("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ], +) +def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list[key].to_pandas() - pd_result = pd_s.list[key] + bf_result = repeated_df[column_name].list[key].to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list[key] - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) @pytest.mark.parametrize( @@ -60,24 +73,36 @@ def test_getitem(key): (slice(0, 2, 2), pytest.raises(NotImplementedError)), ], ) -def test_getitem_notsupported(key, expectation): - data = [[1], [2, 3], [4, 5, 6]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - +def test_getitem_notsupported(key, expectation, repeated_df): with expectation as e: - assert s.list[key] == e + assert repeated_df["int_list_col"].list[key] == e -def test_len(): +@pytest.mark.parametrize( + 
("column_name", "dtype"), + [ + pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))), + pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))), + pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))), + pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))), + pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))), + pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))), + pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))), + ], +) +def test_len(column_name, dtype, repeated_df, repeated_pandas_df): if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): pytest.skip( "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" ) - data = [[], [1], [1, 2], [1, 2, 3]] - s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - bf_result = s.list.len().to_pandas() - pd_result = pd_s.list.len() + bf_result = repeated_df[column_name].list.len().to_pandas() + pd_result = repeated_pandas_df[column_name].astype(dtype).list.len() - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + assert_series_equal( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + check_names=False, + ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 15e8512317..98fecaa93b 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -615,21 +615,28 @@ def test_getitem_w_string(scalars_dfs, index): @pytest.mark.parametrize( ("index"), [ - pytest.param(2, id="int"), + pytest.param(0, id="int"), pytest.param(slice(None, None, None), id="default_start_slice"), pytest.param(slice(0, None, 1), id="default_stop_slice"), pytest.param(slice(0, 2, None), id="default_step_slice"), pytest.param(slice(0, 0, None), id="single_one_slice"), ], ) -def test_getitem_w_array(index): - data = [[1], [2, 3], [], [4, 5, 6]] - s = bpd.Series(data) - pd_s = pd.Series(data) - - bf_result = s.str[index].to_pandas() - pd_result = pd_s.str[index] - # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. 
+@pytest.mark.parametrize( + "column_name", + [ + pytest.param("int_list_col"), + pytest.param("bool_list_col"), + pytest.param("float_list_col"), + pytest.param("string_list_col"), + # date, date_time and numeric are excluded because their default types are different + # in Pandas and BigFrames + ], +) +def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df): + bf_result = repeated_df[column_name].str[index].to_pandas() + pd_result = repeated_pandas_df[column_name].str[index] + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) From ad8263dd0772d87106b401703cd32897ab01f187 Mon Sep 17 00:00:00 2001 From: Luis Velasco <57447519+velascoluis@users.noreply.github.com> Date: Tue, 17 Sep 2024 22:15:34 +0200 Subject: [PATCH 16/18] chore: fix tpch q5 (#984) --- tests/benchmark/tpch/q5.py | 2 +- third_party/bigframes_vendored/tpch/queries/q5.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/tpch/q5.py b/tests/benchmark/tpch/q5.py index e4b3cb0f51..bc39b76e82 100644 --- a/tests/benchmark/tpch/q5.py +++ b/tests/benchmark/tpch/q5.py @@ -14,7 +14,7 @@ import pathlib import benchmark.utils as utils -import bigframes_vendored.tpch.queries.q1 as vendored_tpch_q5 +import bigframes_vendored.tpch.queries.q5 as vendored_tpch_q5 if __name__ == "__main__": dataset_id, session, suffix = utils.get_tpch_configuration() diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py index 20bd8d9c44..b3a9b5e614 100644 --- a/third_party/bigframes_vendored/tpch/queries/q5.py +++ b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -23,7 +23,7 @@ def q(dataset_id: str, session: bigframes.Session): index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERES", + f"bigframes-dev-perf.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( From 7dbda8461da84bdfa2c15049235b7f925129e5e6 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:08:04 -0700 Subject: [PATCH 17/18] chore: update local benchmark, 10t config and kokoro project. (#995) * chore: update local benchmark, 10t config and kokoro project. * update details * fixes * update config * update config reading * update config * remove notebook for now. 
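For reference, a minimal usage sketch of the options this change introduces (flags as defined by the argparse setup added to noxfile.py and scripts/run_and_publish_benchmark.py below; the project name is a placeholder, not a real project):

    # Run selected TPC-H benchmarks, average each over 3 iterations, and write the
    # usage report to an auto-generated temporary CSV file.
    nox -s benchmark -- --iterations 3 --benchmark-filter tpch/q1.py tpch/q5.py --output-csv

    # Publish the averaged metrics to a custom BigQuery project instead of the
    # default bigframes-metrics (intended for local testing).
    GCLOUD_BENCH_PUBLISH_PROJECT=my-test-project nox -s benchmark -- --iterations 2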
--- .kokoro/load/benchmark.cfg | 2 +- noxfile.py | 56 ++++++- scripts/run_and_publish_benchmark.py | 138 ++++++++++++------ tests/benchmark/tpch/config.jsonl | 18 ++- tests/benchmark/tpch/q1.py | 4 +- tests/benchmark/tpch/q10.py | 4 +- tests/benchmark/tpch/q11.py | 4 +- tests/benchmark/tpch/q12.py | 4 +- tests/benchmark/tpch/q13.py | 4 +- tests/benchmark/tpch/q14.py | 4 +- tests/benchmark/tpch/q15.py | 4 +- tests/benchmark/tpch/q16.py | 4 +- tests/benchmark/tpch/q17.py | 4 +- tests/benchmark/tpch/q18.py | 4 +- tests/benchmark/tpch/q19.py | 4 +- tests/benchmark/tpch/q2.py | 4 +- tests/benchmark/tpch/q20.py | 4 +- tests/benchmark/tpch/q21.py | 4 +- tests/benchmark/tpch/q22.py | 4 +- tests/benchmark/tpch/q3.py | 4 +- tests/benchmark/tpch/q4.py | 4 +- tests/benchmark/tpch/q5.py | 4 +- tests/benchmark/tpch/q6.py | 4 +- tests/benchmark/tpch/q7.py | 4 +- tests/benchmark/tpch/q8.py | 4 +- tests/benchmark/tpch/q9.py | 4 +- tests/benchmark/utils.py | 10 +- .../bigframes_vendored/tpch/queries/q1.py | 4 +- .../bigframes_vendored/tpch/queries/q10.py | 10 +- .../bigframes_vendored/tpch/queries/q11.py | 8 +- .../bigframes_vendored/tpch/queries/q12.py | 6 +- .../bigframes_vendored/tpch/queries/q13.py | 6 +- .../bigframes_vendored/tpch/queries/q14.py | 6 +- .../bigframes_vendored/tpch/queries/q15.py | 6 +- .../bigframes_vendored/tpch/queries/q16.py | 8 +- .../bigframes_vendored/tpch/queries/q17.py | 6 +- .../bigframes_vendored/tpch/queries/q18.py | 8 +- .../bigframes_vendored/tpch/queries/q19.py | 6 +- .../bigframes_vendored/tpch/queries/q2.py | 12 +- .../bigframes_vendored/tpch/queries/q20.py | 12 +- .../bigframes_vendored/tpch/queries/q21.py | 10 +- .../bigframes_vendored/tpch/queries/q22.py | 6 +- .../bigframes_vendored/tpch/queries/q3.py | 8 +- .../bigframes_vendored/tpch/queries/q4.py | 6 +- .../bigframes_vendored/tpch/queries/q5.py | 14 +- .../bigframes_vendored/tpch/queries/q6.py | 4 +- .../bigframes_vendored/tpch/queries/q7.py | 12 +- .../bigframes_vendored/tpch/queries/q8.py | 16 +- .../bigframes_vendored/tpch/queries/q9.py | 14 +- 49 files changed, 301 insertions(+), 199 deletions(-) diff --git a/.kokoro/load/benchmark.cfg b/.kokoro/load/benchmark.cfg index bc2d7a2655..45e2527a0c 100644 --- a/.kokoro/load/benchmark.cfg +++ b/.kokoro/load/benchmark.cfg @@ -13,7 +13,7 @@ env_vars: { env_vars: { key: "GOOGLE_CLOUD_PROJECT" - value: "bigframes-load-testing" + value: "bigframes-benchmarking" } env_vars: { diff --git a/noxfile.py b/noxfile.py index f4142af21b..bf4c6f9641 100644 --- a/noxfile.py +++ b/noxfile.py @@ -16,6 +16,7 @@ from __future__ import absolute_import +import argparse import multiprocessing import os import pathlib @@ -804,7 +805,7 @@ def notebook(session: nox.Session): processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: - args = ( + region_args = ( "python", "scripts/run_and_publish_benchmark.py", "--notebook", @@ -814,7 +815,7 @@ def notebook(session: nox.Session): if multi_process_mode: process = multiprocessing.Process( target=_run_process, - args=(session, args, error_flag), + args=(session, region_args, error_flag), ) process.start() processes.append(process) @@ -822,7 +823,7 @@ def notebook(session: nox.Session): # process to avoid potential race conditions。 time.sleep(1) else: - session.run(*args) + session.run(*region_args) for process in processes: process.join() @@ -861,7 +862,51 @@ def benchmark(session: nox.Session): session.install("-e", ".[all]") base_path = os.path.join("tests", "benchmark") - benchmark_script_list = 
list(pathlib.Path(base_path).rglob("*.py")) + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--iterations", + type=int, + default=1, + help="Number of iterations to run each benchmark.", + ) + parser.add_argument( + "-o", + "--output-csv", + nargs="?", + const=True, + default=False, + help=( + "Determines whether to output results to a CSV file. If no location is provided, " + "a temporary location is automatically generated." + ), + ) + parser.add_argument( + "-b", + "--benchmark-filter", + nargs="+", + help=( + "List of file or directory names to include in the benchmarks. If not provided, " + "all benchmarks are run." + ), + ) + + args = parser.parse_args(session.posargs) + + benchmark_script_list: List[pathlib.Path] = [] + if args.benchmark_filter: + for filter_item in args.benchmark_filter: + full_path = os.path.join(base_path, filter_item) + if os.path.isdir(full_path): + benchmark_script_list.extend(pathlib.Path(full_path).rglob("*.py")) + elif os.path.isfile(full_path) and full_path.endswith(".py"): + benchmark_script_list.append(pathlib.Path(full_path)) + else: + raise ValueError( + f"Item {filter_item} does not match any valid file or directory" + ) + else: + benchmark_script_list = list(pathlib.Path(base_path).rglob("*.py")) try: for benchmark in benchmark_script_list: @@ -871,12 +916,15 @@ def benchmark(session: nox.Session): "python", "scripts/run_and_publish_benchmark.py", f"--benchmark-path={benchmark}", + f"--iterations={args.iterations}", ) finally: session.run( "python", "scripts/run_and_publish_benchmark.py", f"--publish-benchmarks={base_path}", + f"--iterations={args.iterations}", + f"--output-csv={args.output_csv}", ) diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py index 4b3d4f4d96..a42301cb13 100644 --- a/scripts/run_and_publish_benchmark.py +++ b/scripts/run_and_publish_benchmark.py @@ -19,6 +19,7 @@ import pathlib import subprocess import sys +import tempfile from typing import Dict, List, Union import numpy as np @@ -50,7 +51,7 @@ def run_benchmark_subprocess(args, log_env_name_var, filename=None, region=None) subprocess.run(args, env=env, check=True) -def collect_benchmark_result(benchmark_path: str) -> pd.DataFrame: +def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFrame: """Generate a DataFrame report on HTTP queries, bytes processed, slot time and execution time from log files.""" path = pathlib.Path(benchmark_path) try: @@ -100,28 +101,23 @@ def collect_benchmark_result(benchmark_path: str) -> pd.DataFrame: with open(bytes_file, "r") as file: lines = file.read().splitlines() - query_count = len(lines) - total_bytes = sum(int(line) for line in lines) + query_count = len(lines) / iterations + total_bytes = sum(int(line) for line in lines) / iterations with open(millis_file, "r") as file: lines = file.read().splitlines() - total_slot_millis = sum(int(line) for line in lines) + total_slot_millis = sum(int(line) for line in lines) / iterations if has_local_seconds: - # 'local_seconds' captures the total execution time for a benchmark as it - # starts timing immediately before the benchmark code begins and stops - # immediately after it ends. Unlike other metrics that might accumulate - # values proportional to the number of queries executed, 'local_seconds' is - # a singular measure of the time taken for the complete execution of the - # benchmark, from start to finish. 
with open(local_seconds_file, "r") as file: - local_seconds = float(file.readline().strip()) + lines = file.read().splitlines() + local_seconds = sum(float(line) for line in lines) / iterations else: local_seconds = None with open(bq_seconds_file, "r") as file: lines = file.read().splitlines() - bq_seconds = sum(float(line) for line in lines) + bq_seconds = sum(float(line) for line in lines) / iterations results_dict[str(filename)] = [ query_count, @@ -154,7 +150,12 @@ def collect_benchmark_result(benchmark_path: str) -> pd.DataFrame: columns=columns, ) - print("---BIGQUERY USAGE REPORT---") + report_title = ( + "---BIGQUERY USAGE REPORT---" + if iterations == 1 + else f"---BIGQUERY USAGE REPORT (Averages over {iterations} Iterations)---" + ) + print(report_title) for index, row in benchmark_metrics.iterrows(): formatted_local_exec_time = ( f"{round(row['Local_Execution_Time_Sec'], 1)} seconds" @@ -259,7 +260,26 @@ def find_config(start_path): return None -def run_benchmark_from_config(benchmark: str): +def publish_to_bigquery(dataframe, notebook, project_name="bigframes-metrics"): + bigquery_table = ( + f"{project_name}.benchmark_report.notebook_benchmark" + if notebook + else f"{project_name}.benchmark_report.benchmark" + ) + + repo_status = get_repository_status() + for idx, col in enumerate(repo_status.keys()): + dataframe.insert(idx, col, repo_status[col]) + + pandas_gbq.to_gbq( + dataframe=dataframe, + destination_table=bigquery_table, + if_exists="append", + ) + print(f"Results have been successfully uploaded to {bigquery_table}.") + + +def run_benchmark_from_config(benchmark: str, iterations: int): print(benchmark) config_path = find_config(benchmark) @@ -267,24 +287,26 @@ def run_benchmark_from_config(benchmark: str): benchmark_configs = [] with open(config_path, "r") as f: for line in f: - config = json.loads(line) - python_args = [f"--{key}={value}" for key, value in config.items()] - suffix = ( - config["benchmark_suffix"] - if "benchmark_suffix" in config - else "_".join(f"{key}_{value}" for key, value in config.items()) - ) - benchmark_configs.append((suffix, python_args)) + if line.strip(): + config = json.loads(line) + python_args = [f"--{key}={value}" for key, value in config.items()] + suffix = ( + config["benchmark_suffix"] + if "benchmark_suffix" in config + else "_".join(f"{key}_{value}" for key, value in config.items()) + ) + benchmark_configs.append((suffix, python_args)) else: benchmark_configs = [(None, [])] - for benchmark_config in benchmark_configs: - args = ["python", str(benchmark)] - args.extend(benchmark_config[1]) - log_env_name_var = str(benchmark) - if benchmark_config[0] is not None: - log_env_name_var += f"_{benchmark_config[0]}" - run_benchmark_subprocess(args=args, log_env_name_var=log_env_name_var) + for _ in range(iterations): + for benchmark_config in benchmark_configs: + args = ["python", str(benchmark)] + args.extend(benchmark_config[1]) + log_env_name_var = str(benchmark) + if benchmark_config[0] is not None: + log_env_name_var += f"_{benchmark_config[0]}" + run_benchmark_subprocess(args=args, log_env_name_var=log_env_name_var) def run_notebook_benchmark(benchmark_file: str, region: str): @@ -341,6 +363,19 @@ def parse_arguments(): help="Set the benchmarks to be published to BigQuery.", ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run each benchmark.", + ) + parser.add_argument( + "--output-csv", + type=str, + default=None, + help="Determines whether to output results to a CSV file. 
If no location is provided, a temporary location is automatically generated.", + ) + return parser.parse_args() @@ -348,28 +383,39 @@ def main(): args = parse_arguments() if args.publish_benchmarks: - bigquery_table = ( - "bigframes-metrics.benchmark_report.notebook_benchmark" - if args.notebook - else "bigframes-metrics.benchmark_report.benchmark" + benchmark_metrics = collect_benchmark_result( + args.publish_benchmarks, args.iterations ) - benchmark_metrics = collect_benchmark_result(args.publish_benchmarks) - - if os.getenv("BENCHMARK_AND_PUBLISH", "false") == "true": - repo_status = get_repository_status() - for idx, col in enumerate(repo_status.keys()): - benchmark_metrics.insert(idx, col, repo_status[col]) - - pandas_gbq.to_gbq( - dataframe=benchmark_metrics, - destination_table=bigquery_table, - if_exists="append", + # Output results to CSV without specifying a location + if args.output_csv == "True": + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + temp_file = tempfile.NamedTemporaryFile( + prefix=f"benchmark_{current_time}_", delete=False, suffix=".csv" ) - print("Results have been successfully uploaded to BigQuery.") + benchmark_metrics.to_csv(temp_file.name, index=False) + print( + f"Benchmark result is saved to a temporary location: {temp_file.name}" + ) + temp_file.close() + # Output results to CSV with specified a custom location + elif args.output_csv != "False": + benchmark_metrics.to_csv(args.output_csv, index=False) + print(f"Benchmark result is saved to: {args.output_csv}") + + # Publish the benchmark metrics to BigQuery under the 'bigframes-metrics' project. + # The 'BENCHMARK_AND_PUBLISH' environment variable should be set to 'true' only + # in specific Kokoro sessions. + if os.getenv("BENCHMARK_AND_PUBLISH", "false") == "true": + publish_to_bigquery(benchmark_metrics, args.notebook) + # If the 'GCLOUD_BENCH_PUBLISH_PROJECT' environment variable is set, publish the + # benchmark metrics to a specified BigQuery table in the provided project. This is + # intended for local testing where the default behavior is not to publish results. 
+ elif project := os.getenv("GCLOUD_BENCH_PUBLISH_PROJECT", ""): + publish_to_bigquery(benchmark_metrics, args.notebook, project) elif args.notebook: run_notebook_benchmark(args.benchmark_path, args.region) else: - run_benchmark_from_config(args.benchmark_path) + run_benchmark_from_config(args.benchmark_path, args.iterations) if __name__ == "__main__": diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl index 1c8b4dd19a..e6f7a444f6 100644 --- a/tests/benchmark/tpch/config.jsonl +++ b/tests/benchmark/tpch/config.jsonl @@ -1,8 +1,10 @@ -{"benchmark_suffix": "1g_ordered", "dataset_id": "tpch_0001g", "ordered": true} -{"benchmark_suffix": "1g_unordered", "dataset_id": "tpch_0001g", "ordered": false} -{"benchmark_suffix": "10g_ordered", "dataset_id": "tpch_0010g", "ordered": true} -{"benchmark_suffix": "10g_unordered", "dataset_id": "tpch_0010g", "ordered": false} -{"benchmark_suffix": "100g_ordered", "dataset_id": "tpch_0100g", "ordered": true} -{"benchmark_suffix": "100g_unordered", "dataset_id": "tpch_0100g", "ordered": false} -{"benchmark_suffix": "1t_ordered", "dataset_id": "tpch_0001t", "ordered": true} -{"benchmark_suffix": "1t_unordered", "dataset_id": "tpch_0001t", "ordered": false} +{"benchmark_suffix": "1g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001g", "ordered": true} +{"benchmark_suffix": "1g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001g", "ordered": false} +{"benchmark_suffix": "10g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010g", "ordered": true} +{"benchmark_suffix": "10g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010g", "ordered": false} +{"benchmark_suffix": "100g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": true} +{"benchmark_suffix": "100g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": false} +{"benchmark_suffix": "1t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": true} +{"benchmark_suffix": "1t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": false} +{"benchmark_suffix": "10t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": true} +{"benchmark_suffix": "10t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": false} diff --git a/tests/benchmark/tpch/q1.py b/tests/benchmark/tpch/q1.py index 398e9bf9b2..3f1c63967e 100644 --- a/tests/benchmark/tpch/q1.py +++ b/tests/benchmark/tpch/q1.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q1 as vendored_tpch_q1 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q1.q, current_path, suffix, dataset_id, session + vendored_tpch_q1.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q10.py b/tests/benchmark/tpch/q10.py index e500a657b6..bea18975ca 100644 --- a/tests/benchmark/tpch/q10.py +++ b/tests/benchmark/tpch/q10.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q10 as vendored_tpch_q10 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() 
utils.get_execution_time( - vendored_tpch_q10.q, current_path, suffix, dataset_id, session + vendored_tpch_q10.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q11.py b/tests/benchmark/tpch/q11.py index 46bd2f022c..538d8d3e5f 100644 --- a/tests/benchmark/tpch/q11.py +++ b/tests/benchmark/tpch/q11.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q11 as vendored_tpch_q11 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q11.q, current_path, suffix, dataset_id, session + vendored_tpch_q11.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q12.py b/tests/benchmark/tpch/q12.py index d2fab97699..6503b543f4 100644 --- a/tests/benchmark/tpch/q12.py +++ b/tests/benchmark/tpch/q12.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q12 as vendored_tpch_q12 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q12.q, current_path, suffix, dataset_id, session + vendored_tpch_q12.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q13.py b/tests/benchmark/tpch/q13.py index 24eec7090c..60c2101f6f 100644 --- a/tests/benchmark/tpch/q13.py +++ b/tests/benchmark/tpch/q13.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q13 as vendored_tpch_q13 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q13.q, current_path, suffix, dataset_id, session + vendored_tpch_q13.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q14.py b/tests/benchmark/tpch/q14.py index 8aa7ed4d2e..1698a01628 100644 --- a/tests/benchmark/tpch/q14.py +++ b/tests/benchmark/tpch/q14.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q14 as vendored_tpch_q14 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q14.q, current_path, suffix, dataset_id, session + vendored_tpch_q14.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q15.py b/tests/benchmark/tpch/q15.py index 511cfbc683..49e2ce4e92 100644 --- a/tests/benchmark/tpch/q15.py +++ b/tests/benchmark/tpch/q15.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q15 as vendored_tpch_q15 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q15.q, current_path, suffix, dataset_id, session + vendored_tpch_q15.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q16.py b/tests/benchmark/tpch/q16.py index 1d1f4b5f30..ef6edf6b12 100644 --- a/tests/benchmark/tpch/q16.py +++ b/tests/benchmark/tpch/q16.py @@ 
-17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q16 as vendored_tpch_q16 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q16.q, current_path, suffix, dataset_id, session + vendored_tpch_q16.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q17.py b/tests/benchmark/tpch/q17.py index e285cc9fca..2f680d206e 100644 --- a/tests/benchmark/tpch/q17.py +++ b/tests/benchmark/tpch/q17.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q17 as vendored_tpch_q17 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q17.q, current_path, suffix, dataset_id, session + vendored_tpch_q17.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q18.py b/tests/benchmark/tpch/q18.py index de9e4f2f94..7336246f1b 100644 --- a/tests/benchmark/tpch/q18.py +++ b/tests/benchmark/tpch/q18.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q18 as vendored_tpch_q18 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q18.q, current_path, suffix, dataset_id, session + vendored_tpch_q18.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q19.py b/tests/benchmark/tpch/q19.py index 7b13b0d250..3bf34794bf 100644 --- a/tests/benchmark/tpch/q19.py +++ b/tests/benchmark/tpch/q19.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q19 as vendored_tpch_q19 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q19.q, current_path, suffix, dataset_id, session + vendored_tpch_q19.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q2.py b/tests/benchmark/tpch/q2.py index 57151e36bd..c738aae124 100644 --- a/tests/benchmark/tpch/q2.py +++ b/tests/benchmark/tpch/q2.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q2 as vendored_tpch_q2 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q2.q, current_path, suffix, dataset_id, session + vendored_tpch_q2.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q20.py b/tests/benchmark/tpch/q20.py index e02e9306f8..1b254c6a78 100644 --- a/tests/benchmark/tpch/q20.py +++ b/tests/benchmark/tpch/q20.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q20 as vendored_tpch_q20 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q20.q, current_path, 
suffix, dataset_id, session + vendored_tpch_q20.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q21.py b/tests/benchmark/tpch/q21.py index d123286c3e..18e8df87fe 100644 --- a/tests/benchmark/tpch/q21.py +++ b/tests/benchmark/tpch/q21.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q21 as vendored_tpch_q21 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q21.q, current_path, suffix, dataset_id, session + vendored_tpch_q21.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q22.py b/tests/benchmark/tpch/q22.py index 841740da0e..6c10021c2b 100644 --- a/tests/benchmark/tpch/q22.py +++ b/tests/benchmark/tpch/q22.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q22 as vendored_tpch_q22 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q22.q, current_path, suffix, dataset_id, session + vendored_tpch_q22.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q3.py b/tests/benchmark/tpch/q3.py index 28cdfed49c..5bcaaa0d5d 100644 --- a/tests/benchmark/tpch/q3.py +++ b/tests/benchmark/tpch/q3.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q3 as vendored_tpch_q3 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q3.q, current_path, suffix, dataset_id, session + vendored_tpch_q3.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q4.py b/tests/benchmark/tpch/q4.py index aa67cc77a0..462c6336d1 100644 --- a/tests/benchmark/tpch/q4.py +++ b/tests/benchmark/tpch/q4.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q4 as vendored_tpch_q4 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q4.q, current_path, suffix, dataset_id, session + vendored_tpch_q4.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q5.py b/tests/benchmark/tpch/q5.py index bc39b76e82..108cde58cc 100644 --- a/tests/benchmark/tpch/q5.py +++ b/tests/benchmark/tpch/q5.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q5 as vendored_tpch_q5 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q5.q, current_path, suffix, dataset_id, session + vendored_tpch_q5.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q6.py b/tests/benchmark/tpch/q6.py index a193333045..ccefc1b0bf 100644 --- a/tests/benchmark/tpch/q6.py +++ b/tests/benchmark/tpch/q6.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q6 as vendored_tpch_q6 if __name__ == 
"__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q6.q, current_path, suffix, dataset_id, session + vendored_tpch_q6.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q7.py b/tests/benchmark/tpch/q7.py index 8a17eb91ea..0cad599a60 100644 --- a/tests/benchmark/tpch/q7.py +++ b/tests/benchmark/tpch/q7.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q7 as vendored_tpch_q7 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q7.q, current_path, suffix, dataset_id, session + vendored_tpch_q7.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q8.py b/tests/benchmark/tpch/q8.py index b5e7d7aa37..6c6ac23b9b 100644 --- a/tests/benchmark/tpch/q8.py +++ b/tests/benchmark/tpch/q8.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q8 as vendored_tpch_q8 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q8.q, current_path, suffix, dataset_id, session + vendored_tpch_q8.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/tpch/q9.py b/tests/benchmark/tpch/q9.py index c098e9a8da..05c82fb66e 100644 --- a/tests/benchmark/tpch/q9.py +++ b/tests/benchmark/tpch/q9.py @@ -17,9 +17,9 @@ import bigframes_vendored.tpch.queries.q9 as vendored_tpch_q9 if __name__ == "__main__": - dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_tpch_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_tpch_q9.q, current_path, suffix, dataset_id, session + vendored_tpch_q9.q, current_path, suffix, project_id, dataset_id, session ) diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index c286d4e229..32be33fc74 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -43,6 +43,12 @@ def get_dbbenchmark_configuration(): def get_tpch_configuration(): parser = argparse.ArgumentParser(description="Process TPC-H Query using BigFrames.") + parser.add_argument( + "--project_id", + type=str, + required=True, + help="The BigQuery dataset ID to query.", + ) parser.add_argument( "--dataset_id", type=str, @@ -62,7 +68,7 @@ def get_tpch_configuration(): args = parser.parse_args() session = _initialize_session(_str_to_bool(args.ordered)) - return args.dataset_id, session, args.benchmark_suffix + return args.project_id, args.dataset_id, session, args.benchmark_suffix def get_execution_time(func, current_path, suffix, *args, **kwargs): @@ -73,7 +79,7 @@ def get_execution_time(func, current_path, suffix, *args, **kwargs): clock_time_file_path = f"{current_path}_{suffix}.local_exec_time_seconds" - with open(clock_time_file_path, "w") as log_file: + with open(clock_time_file_path, "a") as log_file: log_file.write(f"{runtime}\n") diff --git a/third_party/bigframes_vendored/tpch/queries/q1.py b/third_party/bigframes_vendored/tpch/queries/q1.py index 3f155ec1c9..e1fdf85f58 100644 --- 
a/third_party/bigframes_vendored/tpch/queries/q1.py +++ b/third_party/bigframes_vendored/tpch/queries/q1.py @@ -7,9 +7,9 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py index 95b7b87b0e..75a8f2de7f 100644 --- a/third_party/bigframes_vendored/tpch/queries/q10.py +++ b/third_party/bigframes_vendored/tpch/queries/q10.py @@ -7,21 +7,21 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index 269728f6f1..484a7c0001 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -4,17 +4,17 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) partsupp = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PARTSUPP", + f"{project_id}.{dataset_id}.PARTSUPP", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q12.py b/third_party/bigframes_vendored/tpch/queries/q12.py index 1c39275932..e2b7aaf9f2 100644 --- a/third_party/bigframes_vendored/tpch/queries/q12.py +++ b/third_party/bigframes_vendored/tpch/queries/q12.py @@ -7,13 +7,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q13.py b/third_party/bigframes_vendored/tpch/queries/q13.py index dd69cf509f..ea2f0da284 100644 --- a/third_party/bigframes_vendored/tpch/queries/q13.py +++ b/third_party/bigframes_vendored/tpch/queries/q13.py @@ -6,13 +6,13 @@ import 
bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index 8c25a5897d..27f3d9e224 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -5,13 +5,13 @@ import bigframes -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) part = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index fae3010e36..042adbda8b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -6,13 +6,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index b5eb62547f..1bd2795c42 100644 --- a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -4,17 +4,17 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): part = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) partsupp = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PARTSUPP", + f"{project_id}.{dataset_id}.PARTSUPP", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py index a95d128b5c..0bd1c44315 100644 --- a/third_party/bigframes_vendored/tpch/queries/q17.py +++ b/third_party/bigframes_vendored/tpch/queries/q17.py @@ -4,13 +4,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) part = 
session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py index dac9f5c438..f645a08681 100644 --- a/third_party/bigframes_vendored/tpch/queries/q18.py +++ b/third_party/bigframes_vendored/tpch/queries/q18.py @@ -6,17 +6,17 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q19.py b/third_party/bigframes_vendored/tpch/queries/q19.py index 526d0aa1a6..1371af53fc 100644 --- a/third_party/bigframes_vendored/tpch/queries/q19.py +++ b/third_party/bigframes_vendored/tpch/queries/q19.py @@ -4,13 +4,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) part = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q2.py b/third_party/bigframes_vendored/tpch/queries/q2.py index 0586a5d1a5..f388252993 100644 --- a/third_party/bigframes_vendored/tpch/queries/q2.py +++ b/third_party/bigframes_vendored/tpch/queries/q2.py @@ -3,25 +3,25 @@ import bigframes -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): region = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.REGION", + f"{project_id}.{dataset_id}.REGION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) part = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) partsupp = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PARTSUPP", + f"{project_id}.{dataset_id}.PARTSUPP", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index 671d7e06fb..26651a31c4 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -6,25 +6,25 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( 
- f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) part = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PART", + f"{project_id}.{dataset_id}.PART", index_col=bigframes.enums.DefaultIndexKind.NULL, ) partsupp = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.PARTSUPP", + f"{project_id}.{dataset_id}.PARTSUPP", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q21.py b/third_party/bigframes_vendored/tpch/queries/q21.py index 3a4ea495c9..b844cd6f6b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q21.py +++ b/third_party/bigframes_vendored/tpch/queries/q21.py @@ -6,21 +6,21 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index 97180cd11a..137a7d5c36 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -4,13 +4,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q3.py b/third_party/bigframes_vendored/tpch/queries/q3.py index 94b85f2054..fb09abe159 100644 --- a/third_party/bigframes_vendored/tpch/queries/q3.py +++ b/third_party/bigframes_vendored/tpch/queries/q3.py @@ -5,17 +5,17 @@ import bigframes -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", 
index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py index 9cc6f73c98..b89f70845f 100644 --- a/third_party/bigframes_vendored/tpch/queries/q4.py +++ b/third_party/bigframes_vendored/tpch/queries/q4.py @@ -8,13 +8,13 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py index b3a9b5e614..9839c025a5 100644 --- a/third_party/bigframes_vendored/tpch/queries/q5.py +++ b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -5,29 +5,29 @@ import bigframes -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): region = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.REGION", + f"{project_id}.{dataset_id}.REGION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) orders = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.ORDERS", + f"{project_id}.{dataset_id}.ORDERS", index_col=bigframes.enums.DefaultIndexKind.NULL, ) supplier = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + f"{project_id}.{dataset_id}.SUPPLIER", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q6.py b/third_party/bigframes_vendored/tpch/queries/q6.py index 13341c4f4e..b883837fe2 100644 --- a/third_party/bigframes_vendored/tpch/queries/q6.py +++ b/third_party/bigframes_vendored/tpch/queries/q6.py @@ -5,9 +5,9 @@ import bigframes -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): lineitem = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.LINEITEM", + f"{project_id}.{dataset_id}.LINEITEM", index_col=bigframes.enums.DefaultIndexKind.NULL, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q7.py b/third_party/bigframes_vendored/tpch/queries/q7.py index d922efd1e2..a4dfe3f12e 100644 --- a/third_party/bigframes_vendored/tpch/queries/q7.py +++ b/third_party/bigframes_vendored/tpch/queries/q7.py @@ -8,25 +8,25 @@ import bigframes.pandas as bpd -def q(dataset_id: str, session: bigframes.Session): +def q(project_id: str, dataset_id: str, session: bigframes.Session): nation = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.NATION", + f"{project_id}.{dataset_id}.NATION", index_col=bigframes.enums.DefaultIndexKind.NULL, ) customer = session.read_gbq( - f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + f"{project_id}.{dataset_id}.CUSTOMER", 
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     lineitem = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        f"{project_id}.{dataset_id}.LINEITEM",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     orders = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.ORDERS",
+        f"{project_id}.{dataset_id}.ORDERS",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     supplier = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+        f"{project_id}.{dataset_id}.SUPPLIER",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
 
diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py
index 4520fce14a..1676ec6349 100644
--- a/third_party/bigframes_vendored/tpch/queries/q8.py
+++ b/third_party/bigframes_vendored/tpch/queries/q8.py
@@ -5,33 +5,33 @@
 import bigframes
 
 
-def q(dataset_id: str, session: bigframes.Session):
+def q(project_id: str, dataset_id: str, session: bigframes.Session):
     customer = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.CUSTOMER",
+        f"{project_id}.{dataset_id}.CUSTOMER",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     lineitem = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        f"{project_id}.{dataset_id}.LINEITEM",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     nation = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.NATION",
+        f"{project_id}.{dataset_id}.NATION",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     orders = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.ORDERS",
+        f"{project_id}.{dataset_id}.ORDERS",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     part = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.PART",
+        f"{project_id}.{dataset_id}.PART",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     region = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.REGION",
+        f"{project_id}.{dataset_id}.REGION",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     supplier = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+        f"{project_id}.{dataset_id}.SUPPLIER",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
 
diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py
index d04117f0f1..c2b52789bd 100644
--- a/third_party/bigframes_vendored/tpch/queries/q9.py
+++ b/third_party/bigframes_vendored/tpch/queries/q9.py
@@ -6,29 +6,29 @@
 import bigframes.pandas as bpd
 
 
-def q(dataset_id: str, session: bigframes.Session):
+def q(project_id: str, dataset_id: str, session: bigframes.Session):
     lineitem = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        f"{project_id}.{dataset_id}.LINEITEM",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     nation = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.NATION",
+        f"{project_id}.{dataset_id}.NATION",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     orders = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.ORDERS",
+        f"{project_id}.{dataset_id}.ORDERS",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     part = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.PART",
+        f"{project_id}.{dataset_id}.PART",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     partsupp = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.PARTSUPP",
+        f"{project_id}.{dataset_id}.PARTSUPP",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
     supplier = session.read_gbq(
-        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+        f"{project_id}.{dataset_id}.SUPPLIER",
         index_col=bigframes.enums.DefaultIndexKind.NULL,
     )
 
From d2fb49f84d3e27d79d3248227d057fa1f57bb897 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Wed, 18 Sep 2024 13:21:14 -0700
Subject: [PATCH 18/18] chore(main): release 1.18.0 (#986)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md                              | 21 +++++++++++++++++++++
 bigframes/version.py                      |  2 +-
 third_party/bigframes_vendored/version.py |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a989d8af66..c210f5c991 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,27 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [1.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.17.0...v1.18.0) (2024-09-18)
+
+
+### Features
+
+* Add "include" param to describe for string types ([#973](https://github.com/googleapis/python-bigquery-dataframes/issues/973)) ([deac6d2](https://github.com/googleapis/python-bigquery-dataframes/commit/deac6d2d6e459b26c05f6e5ff328ea03a3cff45f))
+* Add `subset` parameter to `DataFrame.dropna` to select which columns to consider ([#981](https://github.com/googleapis/python-bigquery-dataframes/issues/981)) ([f7c03dc](https://github.com/googleapis/python-bigquery-dataframes/commit/f7c03dcaf7ee4d62497f6653851e390795fc60a2))
+
+
+### Bug Fixes
+
+* DataFrameGroupby.agg now works with unnamed tuples ([#985](https://github.com/googleapis/python-bigquery-dataframes/issues/985)) ([0f047b4](https://github.com/googleapis/python-bigquery-dataframes/commit/0f047b4fae2a10b2a465c506bea561f8bb8d4262))
+* Fix a bug that raises exception when re-indexing columns with their original order ([#988](https://github.com/googleapis/python-bigquery-dataframes/issues/988)) ([596b03b](https://github.com/googleapis/python-bigquery-dataframes/commit/596b03bb3ea27cead9b90200b9ef3cdcd99ca184))
+* Make the `Series.apply` outcome `assign`able to the original dataframe in partial ordering mode ([#874](https://github.com/googleapis/python-bigquery-dataframes/issues/874)) ([c94ead9](https://github.com/googleapis/python-bigquery-dataframes/commit/c94ead996e3bfa98edd51ff678a3d43a10ee980f))
+
+
+### Dependencies
+
+* Limit ibis-framework version to 9.2.0 ([#989](https://github.com/googleapis/python-bigquery-dataframes/issues/989)) ([06c1b33](https://github.com/googleapis/python-bigquery-dataframes/commit/06c1b3396d77d1de4f927328bae70cd7b3eb0b0b))
+* Update to ibis-framework 9.x and newer sqlglot ([#827](https://github.com/googleapis/python-bigquery-dataframes/issues/827)) ([89ea44f](https://github.com/googleapis/python-bigquery-dataframes/commit/89ea44fb66314b134fc0a10d816c1659978d4182))
+
 ## [1.17.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.16.0...v1.17.0) (2024-09-11)
 
 
diff --git a/bigframes/version.py b/bigframes/version.py
index 2c0c6e4d3a..745632616c 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.17.0"
+__version__ = "1.18.0"
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py
index 2c0c6e4d3a..745632616c 100644
--- a/third_party/bigframes_vendored/version.py
+++ b/third_party/bigframes_vendored/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.17.0"
+__version__ = "1.18.0"