From 90cdaca6afc792069b33e471f973fc071211f302 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 28 Aug 2025 18:16:41 +0000 Subject: [PATCH 1/4] chore: fix ops.ToTimedeltaOp and ops.IsInOp sqlglot compiler --- .../compile/sqlglot/expressions/unary_compiler.py | 4 +++- tests/system/small/engines/test_generic_ops.py | 2 +- .../test_unary_compiler/test_is_in/empty.sql | 13 +++++++++++++ .../test_unary_compiler/test_to_timedelta/out.sql | 6 +++--- .../sqlglot/expressions/test_unary_compiler.py | 5 ++++- 5 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 98f1603be7..bf8c518770 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -420,6 +420,8 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.IsInOp) def _(op: ops.IsInOp, expr: TypedExpr) -> sge.Expression: + if op.values is None or len(op.values) == 0: + return sge.convert(False) return sge.In(this=expr.expr, expressions=[sge.convert(v) for v in op.values]) @@ -767,7 +769,7 @@ def _(op: ops.ToTimedeltaOp, expr: TypedExpr) -> sge.Expression: factor = UNIT_TO_US_CONVERSION_FACTORS[op.unit] if factor != 1: value = sge.Mul(this=value, expression=sge.convert(factor)) - return sge.Interval(this=value, unit=sge.Identifier(this="MICROSECOND")) + return value @UNARY_OP_REGISTRATION.register(ops.UnixMicros) diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index 8deef3638e..14c6e9a454 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -392,7 +392,7 @@ def test_engines_invert_op(scalars_array_value: array_value.ArrayValue, engine): assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine): arr, col_ids = scalars_array_value.compute_values( [ diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql new file mode 100644 index 0000000000..ece9a459f6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + FALSE AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql index 01ebebc455..057e6c778e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql @@ -8,7 +8,7 @@ WITH `bfcte_0` AS ( *, `bfcol_1` AS `bfcol_4`, `bfcol_0` AS `bfcol_5`, - INTERVAL `bfcol_0` MICROSECOND AS `bfcol_6` + `bfcol_0` AS `bfcol_6` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT @@ -16,7 +16,7 @@ WITH `bfcte_0` AS ( `bfcol_4` AS `bfcol_10`, `bfcol_5` AS `bfcol_11`, `bfcol_6` AS `bfcol_12`, - INTERVAL (`bfcol_5` * 1000000) MICROSECOND AS `bfcol_13` + `bfcol_5` * 1000000 AS `bfcol_13` FROM `bfcte_1` ), `bfcte_3` AS ( SELECT @@ -25,7 +25,7 @@ WITH `bfcte_0` AS ( `bfcol_11` AS `bfcol_19`, `bfcol_12` AS `bfcol_20`, `bfcol_13` AS `bfcol_21`, - INTERVAL (`bfcol_11` * 604800000000) MICROSECOND AS `bfcol_22` + `bfcol_11` * 604800000000 AS `bfcol_22` FROM `bfcte_2` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 8f3af11842..d1023bd7bb 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -307,10 +307,13 @@ def test_invert(scalar_types_df: bpd.DataFrame, snapshot): def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] - sql = _apply_unary_op(bf_df, ops.IsInOp(values=(1, 2, 3)), "int64_col") + sql = _apply_unary_op(bf_df, ops.IsInOp(values=(1, 2, 3)), "int64_col") snapshot.assert_match(sql, "out.sql") + sql = _apply_unary_op(bf_df, ops.IsInOp(values=()), "int64_col") + snapshot.assert_match(sql, "empty.sql") + def test_isalnum(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["string_col"]] From e5f53784b6ab8b4ebb6346e7b44a6d6d1888f7bf Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 28 Aug 2025 18:22:28 +0000 Subject: [PATCH 2/4] add new line --- .../core/compile/sqlglot/expressions/test_unary_compiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index d1023bd7bb..970d658bf2 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -309,9 +309,11 @@ def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] sql = _apply_unary_op(bf_df, ops.IsInOp(values=(1, 2, 3)), "int64_col") + snapshot.assert_match(sql, "out.sql") sql = _apply_unary_op(bf_df, ops.IsInOp(values=()), "int64_col") + snapshot.assert_match(sql, "empty.sql") From e8bfd928e5ba1c1dbc54a968ee70b44899d5dd4f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 28 Aug 2025 18:27:37 +0000 Subject: [PATCH 3/4] fix tests --- .../snapshots/test_binary_compiler/test_mul_timedelta/out.sql | 2 +- .../snapshots/test_binary_compiler/test_sub_timedelta/out.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql index c8a8cf6cbf..f8752d0a60 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql @@ -11,7 +11,7 @@ WITH `bfcte_0` AS ( `bfcol_1` AS `bfcol_8`, `bfcol_2` AS `bfcol_9`, `bfcol_0` AS `bfcol_10`, - INTERVAL `bfcol_3` MICROSECOND AS `bfcol_11` + `bfcol_3` AS `bfcol_11` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql index 460f941d1b..2d615fcca6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql @@ -11,7 +11,7 @@ WITH `bfcte_0` AS ( `bfcol_1` AS `bfcol_8`, `bfcol_2` AS `bfcol_9`, `bfcol_0` AS `bfcol_10`, - INTERVAL `bfcol_3` MICROSECOND AS `bfcol_11` + `bfcol_3` AS `bfcol_11` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT From 7ba11669041545b3490ef0b317a87c347770182b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 29 Aug 2025 20:38:39 +0000 Subject: [PATCH 4/4] complete isin compiler --- .../sqlglot/expressions/unary_compiler.py | 30 ++++++++- .../test_unary_compiler/test_is_in/empty.sql | 13 ---- .../test_unary_compiler/test_is_in/out.sql | 2 +- .../test_is_in_for_all_cases/out.sql | 61 +++++++++++++++++++ .../expressions/test_unary_compiler.py | 36 ++++++++++- 5 files changed, 123 insertions(+), 19 deletions(-) delete mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in_for_all_cases/out.sql diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index bf8c518770..f519aef70d 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -27,6 +27,7 @@ import bigframes.core.compile.sqlglot.expressions.constants as constants from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr +import bigframes.dtypes as dtypes UNARY_OP_REGISTRATION = OpRegistration() @@ -420,9 +421,28 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.IsInOp) def _(op: ops.IsInOp, expr: TypedExpr) -> sge.Expression: - if op.values is None or len(op.values) == 0: + values = [] + is_numeric_expr = dtypes.is_numeric(expr.dtype) + for value in op.values: + if value is None: + continue + dtype = dtypes.bigframes_type(type(value)) + if expr.dtype == dtype or is_numeric_expr and dtypes.is_numeric(dtype): + values.append(sge.convert(value)) + + if op.match_nulls: + contains_nulls = any(_is_null(value) for value in op.values) + if contains_nulls: + return sge.Is(this=expr.expr, expression=sge.Null()) | sge.In( + this=expr.expr, expressions=values + ) + + if len(values) == 0: return sge.convert(False) - return sge.In(this=expr.expr, expressions=[sge.convert(v) for v in op.values]) + + return sge.func( + "COALESCE", sge.In(this=expr.expr, expressions=values), sge.convert(False) + ) @UNARY_OP_REGISTRATION.register(ops.isalnum_op) @@ -868,3 +888,9 @@ def _(op: ops.ZfillOp, expr: TypedExpr) -> sge.Expression: ], default=sge.func("LPAD", expr.expr, sge.convert(op.width), sge.convert("0")), ) + + +# Helpers +def _is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql deleted file mode 100644 index ece9a459f6..0000000000 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/empty.sql +++ /dev/null @@ -1,13 +0,0 @@ -WITH `bfcte_0` AS ( - SELECT - `int64_col` AS `bfcol_0` - FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` -), `bfcte_1` AS ( - SELECT - *, - FALSE AS `bfcol_1` - FROM `bfcte_0` -) -SELECT - `bfcol_1` AS `int64_col` -FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql index 36941df71b..861dee75f7 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - `bfcol_0` IN (1, 2, 3) AS `bfcol_1` + COALESCE(`bfcol_0` IN (1, 2, 3), FALSE) AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in_for_all_cases/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in_for_all_cases/out.sql new file mode 100644 index 0000000000..823cee821c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in_for_all_cases/out.sql @@ -0,0 +1,61 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `bytes_col` AS `bfcol_1`, + `date_col` AS `bfcol_2`, + `datetime_col` AS `bfcol_3`, + `geography_col` AS `bfcol_4`, + `int64_col` AS `bfcol_5`, + `int64_too` AS `bfcol_6`, + `numeric_col` AS `bfcol_7`, + `float64_col` AS `bfcol_8`, + `rowindex` AS `bfcol_9`, + `rowindex_2` AS `bfcol_10`, + `string_col` AS `bfcol_11`, + `time_col` AS `bfcol_12`, + `timestamp_col` AS `bfcol_13`, + `duration_col` AS `bfcol_14` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + COALESCE(`bfcol_5` IN (1, 2, 3), FALSE) AS `bfcol_31`, + ( + `bfcol_5` IS NULL + ) OR `bfcol_5` IN (123456) AS `bfcol_32`, + COALESCE(`bfcol_5` IN (123456), FALSE) AS `bfcol_33`, + COALESCE(`bfcol_5` IN (1.0, 2.0, 3.0), FALSE) AS `bfcol_34`, + FALSE AS `bfcol_35`, + COALESCE(`bfcol_5` IN (2.5, 3), FALSE) AS `bfcol_36`, + FALSE AS `bfcol_37`, + ( + `bfcol_8` IS NULL + ) OR `bfcol_8` IN (1, 2, 3) AS `bfcol_38` + FROM `bfcte_0` +) +SELECT + `bfcol_9` AS `bfuid_col_1`, + `bfcol_0` AS `bool_col`, + `bfcol_1` AS `bytes_col`, + `bfcol_2` AS `date_col`, + `bfcol_3` AS `datetime_col`, + `bfcol_4` AS `geography_col`, + `bfcol_5` AS `int64_col`, + `bfcol_6` AS `int64_too`, + `bfcol_7` AS `numeric_col`, + `bfcol_8` AS `float64_col`, + `bfcol_9` AS `rowindex`, + `bfcol_10` AS `rowindex_2`, + `bfcol_11` AS `string_col`, + `bfcol_12` AS `time_col`, + `bfcol_13` AS `timestamp_col`, + `bfcol_14` AS `duration_col`, + `bfcol_31` AS `int in ints`, + `bfcol_32` AS `int in ints w null`, + `bfcol_33` AS `int in ints w null wo match nulls`, + `bfcol_34` AS `int in floats`, + `bfcol_35` AS `int in strings`, + `bfcol_36` AS `int in mixed`, + `bfcol_37` AS `int in empty`, + `bfcol_38` AS `float in ints` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 970d658bf2..e164439072 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -15,6 +15,7 @@ import pytest from bigframes import operations as ops +from bigframes.core import expression from bigframes.operations._op_converters import convert_index, convert_slice import bigframes.pandas as bpd @@ -307,14 +308,43 @@ def test_invert(scalar_types_df: bpd.DataFrame, snapshot): def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] - sql = _apply_unary_op(bf_df, ops.IsInOp(values=(1, 2, 3)), "int64_col") snapshot.assert_match(sql, "out.sql") - sql = _apply_unary_op(bf_df, ops.IsInOp(values=()), "int64_col") - snapshot.assert_match(sql, "empty.sql") +def test_is_in_for_all_cases(scalar_types_df: bpd.DataFrame, snapshot): + scalars_array_value = scalar_types_df._block.expr + arr, col_ids = scalars_array_value.compute_values( + [ + ops.IsInOp((1, 2, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456), match_nulls=False).as_expr( + expression.deref("int64_col") + ), + ops.IsInOp((1.0, 2.0, 3.0)).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", "2.0")).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", 2.5, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp(()).as_expr(expression.deref("int64_col")), + ops.IsInOp((1, 2, 3, None)).as_expr(expression.deref("float64_col")), + ] + ) + new_names = ( + "int in ints", + "int in ints w null", + "int in ints w null wo match nulls", + "int in floats", + "int in strings", + "int in mixed", + "int in empty", + "float in ints", + ) + arr = arr.rename_columns( + {old_name: new_names[i] for i, old_name in enumerate(col_ids)} + ) + sql = arr.session._executor.to_sql(arr, enable_cache=False) + + snapshot.assert_match(sql, "out.sql") def test_isalnum(scalar_types_df: bpd.DataFrame, snapshot):