diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 5826ab2871..1b3966b7bb 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -22,7 +22,7 @@ import pyarrow as pa import sqlglot.expressions as sge -from bigframes.core import expression, identifiers, nodes, rewrite +from bigframes.core import expression, guid, identifiers, nodes, rewrite from bigframes.core.compile import configs import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir @@ -33,6 +33,9 @@ class SQLGlotCompiler: """Compiles BigFrame nodes into SQL using SQLGlot.""" + uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator() + """Generator for unique identifiers.""" + def compile( self, node: nodes.BigFrameNode, @@ -107,10 +110,48 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult ) def _compile_result_node(self, root: nodes.ResultNode) -> str: - sqlglot_ir = compile_node(root.child) + sqlglot_ir = self.compile_node(root.child) # TODO: add order_by, limit, and selections to sqlglot_expr return sqlglot_ir.sql + @functools.lru_cache(maxsize=5000) + def compile_node(self, node: nodes.BigFrameNode) -> ir.SQLGlotIR: + """Compiles node into CompileArrayValue. Caches result.""" + return node.reduce_up(lambda node, children: self._compile_node(node, *children)) + + @functools.singledispatchmethod + def _compile_node( + self, node: nodes.BigFrameNode, *compiled_children: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @_compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: + pa_table = node.local_data_source.data + pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) + pa_table = pa_table.rename_columns( + [item.id.sql for item in node.scan_list.items] + ) + + offsets = node.offsets_col.sql if node.offsets_col else None + if offsets: + pa_table = pa_table.append_column( + offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) + ) + + return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen) + + @_compile_node.register + def compile_selection( + self, node: nodes.SelectionNode, child: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + select_cols: typing.Dict[str, sge.Expression] = { + id.name: scalar_compiler.compile_scalar_expression(expr) + for expr, id in node.input_output_pairs + } + return child.select(select_cols) + def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) @@ -128,46 +169,3 @@ def anonymous_column_ids() -> typing.Generator[identifiers.ColumnId, None, None] result_node, _ = rewrite.remap_variables(node, anonymous_column_ids()) return typing.cast(nodes.ResultNode, result_node) - - -@functools.lru_cache(maxsize=5000) -def compile_node(node: nodes.BigFrameNode) -> ir.SQLGlotIR: - """Compiles node into CompileArrayValue. Caches result.""" - return node.reduce_up(lambda node, children: _compile_node(node, *children)) - - -@functools.singledispatch -def _compile_node( - node: nodes.BigFrameNode, *compiled_children: ir.SQLGlotIR -) -> ir.SQLGlotIR: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - - -@_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: - offsets = node.offsets_col.sql if node.offsets_col else None - schema_names = node.schema.names - schema_dtypes = node.schema.dtypes - - pa_table = node.local_data_source.data - pa_table = pa_table.select([item.source_id for item in node.scan_list.items]) - pa_table = pa_table.rename_columns( - {item.source_id: item.id.sql for item in node.scan_list.items} - ) - - if offsets: - pa_table = pa_table.append_column( - offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) - ) - - return ir.SQLGlotIR.from_pandas(pa_table.to_pandas(), schema_names, schema_dtypes) - - -@_compile_node.register -def compile_selection(node: nodes.SelectionNode, child: ir.SQLGlotIR) -> ir.SQLGlotIR: - select_cols: typing.Dict[str, sge.Expression] = { - id.name: scalar_compiler.compile_scalar_expression(expr) - for expr, id in node.input_output_pairs - } - return child.select(select_cols) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index fe85f22494..1037c0ace6 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -17,13 +17,25 @@ import dataclasses import typing -import pandas as pd +import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery import sqlglot.expressions as sge from bigframes import dtypes import bigframes.core.compile.sqlglot.sqlglot_types as sgt +import bigframes.core.local_data as local_data +import bigframes.core.schema as schemata + +from bigframes.core import guid + +# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. +try: + from shapely.io import to_wkt # type: ignore +except ImportError: + from shapely.wkt import dumps # type: ignore + + to_wkt = dumps @dataclasses.dataclass(frozen=True) @@ -42,41 +54,41 @@ class SQLGlotIR: pretty: bool = True """Whether to pretty-print the generated SQL.""" + uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator() + """Generator for unique identifiers.""" + @property def sql(self) -> str: """Generate SQL string from the given expression.""" return self.expr.sql(dialect=self.dialect, pretty=self.pretty) @classmethod - def from_pandas( - cls, - pd_df: pd.DataFrame, - schema_names: typing.Sequence[str], - schema_dtypes: typing.Sequence[dtypes.Dtype], + def from_pyarrow( + cls, pa_table: pa.Table, schema: schemata.ArraySchema, uid_gen: guid.SequentialUIDGenerator ) -> SQLGlotIR: """Builds SQLGlot expression from pyarrow table.""" dtype_expr = sge.DataType( this=sge.DataType.Type.STRUCT, expressions=[ sge.ColumnDef( - this=sge.to_identifier(name, quoted=True), - kind=sgt.SQLGlotType.from_bigframes_dtype(dtype), + this=sge.to_identifier(field.column, quoted=True), + kind=sgt.SQLGlotType.from_bigframes_dtype(field.dtype), ) - for name, dtype in zip(schema_names, schema_dtypes) + for field in schema.items ], nested=True, ) data_expr = [ - sge.Tuple( + sge.Struct( expressions=tuple( _literal( value=value, - dtype=sgt.SQLGlotType.from_bigframes_dtype(dtype), + dtype=field.dtype, ) - for value, dtype in zip(row, schema_dtypes) + for value, field in zip(tuple(row_dict.values()), schema.items) ) ) - for _, row in pd_df.iterrows() + for row_dict in local_data._iter_table(pa_table, schema) ] expr = sge.Unnest( expressions=[ @@ -88,7 +100,7 @@ def from_pandas( ), ], ) - return cls(expr=sg.select(sge.Star()).from_(expr)) + return cls(expr=sg.select(sge.Star()).from_(expr), uid_gen=uid_gen) def select( self, @@ -105,13 +117,36 @@ def select( return SQLGlotIR(expr=expr) -def _literal(value: typing.Any, dtype: str) -> sge.Expression: +def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: + sqlglot_type = sgt.SQLGlotType.from_bigframes_dtype(dtype) if value is None: - return _cast(sge.Null(), dtype) - - # TODO: handle other types like visit_DefaultLiteral - return sge.convert(value) + return _cast(sge.Null(), sqlglot_type) + elif dtype == dtypes.BYTES_DTYPE: + return _cast(str(value), sqlglot_type) + elif dtypes.is_time_like(dtype): + return _cast(sge.convert(value.isoformat()), sqlglot_type) + elif dtypes.is_geo_like(dtype): + wkt = value if isinstance(value, str) else to_wkt(value) + return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt)) + elif dtype == dtypes.JSON_DTYPE: + return sge.ParseJSON(this=sge.convert(str(value))) + elif dtypes.is_struct_like(dtype): + items = [ + _literal(value=value[field_name], dtype=field_dtype).as_( + field_name, quoted=True + ) + for field_name, field_dtype in dtypes.get_struct_fields(dtype).items() + ] + return sge.Struct.from_arg_list(items) + elif dtypes.is_array_like(dtype): + value_type = dtypes.get_array_inner_type(dtype) + values = sge.Array( + expressions=[_literal(value=v, dtype=value_type) for v in value] + ) + return values if len(value) > 0 else _cast(values, sqlglot_type) + else: + return sge.convert(value) -def _cast(arg, to) -> sge.Cast: +def _cast(arg: typing.Any, to: str) -> sge.Cast: return sge.Cast(this=arg, to=to) diff --git a/bigframes/core/guid.py b/bigframes/core/guid.py index 8930d0760a..cb3094c0e2 100644 --- a/bigframes/core/guid.py +++ b/bigframes/core/guid.py @@ -19,3 +19,21 @@ def generate_guid(prefix="col_"): global _GUID_COUNTER _GUID_COUNTER += 1 return f"bfuid_{prefix}{_GUID_COUNTER}" + + +class SequentialUIDGenerator: + """ + Generates sequential-like UIDs with multiple prefixes, e.g., "t0", "t1", "c0", "t2", etc. + """ + + def __init__(self): + self.prefix_counters = {} + + def generate_sequential_uid(self, prefix: str) -> str: + """Generates a sequential UID with specified prefix.""" + if prefix not in self.prefix_counters: + self.prefix_counters[prefix] = 0 + + uid = f"{prefix}{self.prefix_counters[prefix]}" + self.prefix_counters[prefix] += 1 + return uid diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 03755c94b7..2e5a1499b9 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -1,9 +1,9 @@ -{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} -{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} -{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} -{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} +{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "LINESTRING(-0.127959 51.507728, -0.127026 51.507473)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} +{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 23456fc76c..4d871fd707 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -12,29 +12,101 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib + import pandas as pd +import pyarrow as pa import pytest +from bigframes import dtypes +import tests.system.utils + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" + -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def compiler_session(): from . import compiler_session return compiler_session.SQLCompilerSession() -@pytest.fixture(scope="module") -def all_types_df() -> pd.DataFrame: - # TODO: all types pandas dataframes +@pytest.fixture(scope="session") +def scalars_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing all scalar types and using the `rowindex` + column as the index.""" # TODO: add tests for empty dataframes + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + return df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing STRUCT types and using the `id` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) + return df + + +@pytest.fixture(scope="session") +def repeated_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing LIST types and using the `rowindex` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + df = df.set_index("rowindex") + return df + + +@pytest.fixture(scope="session") +def json_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing JSON types and using the `rowindex` + column as the index.""" + json_data = [ + "null", + "true", + "100", + "0.98", + '"a string"', + "[]", + "[1, 2, 3]", + '[{"a": 1}, {"a": 2}, {"a": null}, {}]', + '"100"', + '{"date": "2024-07-16"}', + '{"int_value": 2, "null_filed": null}', + '{"list_data": [10, 20, 30]}', + ] df = pd.DataFrame( { - "int1": pd.Series([1, 2, 3], dtype="Int64"), - "int2": pd.Series([-10, 20, 30], dtype="Int64"), - "bools": pd.Series([True, None, False], dtype="boolean"), - "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"), + "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), }, + index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), ) - # add more complexity index. - df.index = df.index.astype("Int64") return df diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index c2a443dab2..0ef80dc8b0 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -1,7 +1,171 @@ SELECT - `bfcol_0` AS `bfcol_5`, - `bfcol_1` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7`, - `bfcol_3` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` -FROM UNNEST(ARRAY>[(1, -10, TRUE, 'b', 0), (2, 20, CAST(NULL AS BOOLEAN), 'aa', 1), (3, 30, FALSE, 'ccc', 2)]) \ No newline at end of file + `bfcol_0` AS `bfcol_16`, + `bfcol_1` AS `bfcol_17`, + `bfcol_2` AS `bfcol_18`, + `bfcol_3` AS `bfcol_19`, + `bfcol_4` AS `bfcol_20`, + `bfcol_5` AS `bfcol_21`, + `bfcol_6` AS `bfcol_22`, + `bfcol_7` AS `bfcol_23`, + `bfcol_8` AS `bfcol_24`, + `bfcol_9` AS `bfcol_25`, + `bfcol_10` AS `bfcol_26`, + `bfcol_11` AS `bfcol_27`, + `bfcol_12` AS `bfcol_28`, + `bfcol_13` AS `bfcol_29`, + `bfcol_14` AS `bfcol_30`, + `bfcol_15` AS `bfcol_31` +FROM UNNEST(ARRAY>[STRUCT( + 0, + TRUE, + CAST(b'Hello, World!' AS BYTES), + CAST('2021-07-21' AS DATE), + CAST('2021-07-21T11:39:45' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), + 123456789, + 0, + 1.234567890, + 1.25, + 0, + 0, + 'Hello, World!', + CAST('11:41:43.076160' AS TIME), + CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP), + 0 +), STRUCT( + 1, + FALSE, + CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES), + CAST('1991-02-03' AS DATE), + CAST('1991-01-02T03:45:06' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), + -987654321, + 1, + 1.234567890, + 2.51, + 1, + 1, + 'こんにちは', + CAST('11:14:34.701606' AS TIME), + CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP), + 1 +), STRUCT( + 2, + TRUE, + CAST(b'\xc2\xa1Hola Mundo!' AS BYTES), + CAST('2023-03-01' AS DATE), + CAST('2023-03-01T10:55:13' AS DATETIME), + ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), + 314159, + 0, + 101.101010100, + 25000000000.0, + 2, + 2, + ' ¡Hola Mundo! ', + CAST('23:59:59.999999' AS TIME), + CAST('2023-03-01T10:55:13.250125+00:00' AS TIMESTAMP), + 2 +), STRUCT( + 3, + CAST(NULL AS BOOLEAN), + CAST(NULL AS BYTES), + CAST(NULL AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + CAST(NULL AS INT64), + 1, + CAST(NULL AS NUMERIC), + CAST(NULL AS FLOAT64), + 3, + 3, + CAST(NULL AS STRING), + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 3 +), STRUCT( + 4, + FALSE, + CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES), + CAST('2021-07-21' AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + -234892, + -2345, + CAST(NULL AS NUMERIC), + CAST(NULL AS FLOAT64), + 4, + 4, + 'Hello, World!', + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 4 +), STRUCT( + 5, + FALSE, + CAST(b'G\xc3\xbcten Tag' AS BYTES), + CAST('1980-03-14' AS DATE), + CAST('1980-03-14T15:16:17' AS DATETIME), + CAST(NULL AS GEOGRAPHY), + 55555, + 0, + 5.555555000, + 555.555, + 5, + 5, + 'Güten Tag!', + CAST('15:16:17.181921' AS TIME), + CAST('1980-03-14T15:16:17.181921+00:00' AS TIMESTAMP), + 5 +), STRUCT( + 6, + TRUE, + CAST(b'Hello\tBigFrames!\x07' AS BYTES), + CAST('2023-05-23' AS DATE), + CAST('2023-05-23T11:37:01' AS DATETIME), + ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), + 101202303, + 2, + -10.090807000, + -123.456, + 6, + 6, + 'capitalize, This ', + CAST('01:02:03.456789' AS TIME), + CAST('2023-05-23T11:42:55.000001+00:00' AS TIMESTAMP), + 6 +), STRUCT( + 7, + TRUE, + CAST(NULL AS BYTES), + CAST('2038-01-20' AS DATE), + CAST('2038-01-19T03:14:08' AS DATETIME), + CAST(NULL AS GEOGRAPHY), + -214748367, + 2, + 11111111.100000000, + 42.42, + 7, + 7, + ' سلام', + CAST('12:00:00.000001' AS TIME), + CAST('2038-01-19T03:14:17.999999+00:00' AS TIMESTAMP), + 7 +), STRUCT( + 8, + FALSE, + CAST(NULL AS BYTES), + CAST(NULL AS DATE), + CAST(NULL AS DATETIME), + CAST(NULL AS GEOGRAPHY), + 2, + 1, + CAST(NULL AS NUMERIC), + 6.87, + 8, + 8, + 'T', + CAST(NULL AS TIME), + CAST(NULL AS TIMESTAMP), + 8 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql new file mode 100644 index 0000000000..3b780e6d8e --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -0,0 +1,4 @@ +SELECT + `bfcol_0` AS `bfcol_2`, + `bfcol_1` AS `bfcol_3` +FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql new file mode 100644 index 0000000000..6998b41b27 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -0,0 +1,41 @@ +SELECT + `bfcol_0` AS `bfcol_9`, + `bfcol_1` AS `bfcol_10`, + `bfcol_2` AS `bfcol_11`, + `bfcol_3` AS `bfcol_12`, + `bfcol_4` AS `bfcol_13`, + `bfcol_5` AS `bfcol_14`, + `bfcol_6` AS `bfcol_15`, + `bfcol_7` AS `bfcol_16`, + `bfcol_8` AS `bfcol_17` +FROM UNNEST(ARRAY, `bfcol_2` ARRAY, `bfcol_3` ARRAY, `bfcol_4` ARRAY, `bfcol_5` ARRAY, `bfcol_6` ARRAY, `bfcol_7` ARRAY, `bfcol_8` INT64>>[STRUCT( + 0, + [1], + [TRUE], + [1.2, 2.3], + ['2021-07-21'], + ['2021-07-21 11:39:45'], + [1.2, 2.3, 3.4], + ['abc', 'de', 'f'], + 0 +), STRUCT( + 1, + [1, 2], + [TRUE, FALSE], + [1.1], + ['2021-07-21', '1987-03-28'], + ['1999-03-14 17:22:00'], + [5.5, 2.3], + ['a', 'bc', 'de'], + 1 +), STRUCT( + 2, + [1, 2, 3], + [TRUE], + [0.5, -1.9, 2.3], + ['2017-08-01', '2004-11-22'], + ['1979-06-03 03:20:45'], + [1.7000000000000002], + ['', 'a'], + 2 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql new file mode 100644 index 0000000000..42b7bc7361 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_nested_structs_df/out.sql @@ -0,0 +1,19 @@ +SELECT + * +FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[( + 1, + STRUCT( + 'Alice' AS `name`, + 30 AS `age`, + STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` + ), + 0 +), ( + 2, + STRUCT( + 'Bob' AS `name`, + 25 AS `age`, + STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` + ), + 1 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql new file mode 100644 index 0000000000..99b94915bf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -0,0 +1,21 @@ +SELECT + `bfcol_0` AS `bfcol_3`, + `bfcol_1` AS `bfcol_4`, + `bfcol_2` AS `bfcol_5` +FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[STRUCT( + 1, + STRUCT( + 'Alice' AS `name`, + 30 AS `age`, + STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` + ), + 0 +), STRUCT( + 2, + STRUCT( + 'Bob' AS `name`, + 25 AS `age`, + STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` + ), + 1 +)]) \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index ad7cc5785a..58587da129 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -22,7 +22,34 @@ def test_compile_readlocal( - all_types_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot + scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): - bf_df = bpd.DataFrame(all_types_df, session=compiler_session) + bf_df = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readlocal_w_structs_df( + nested_structs_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readlocal_w_lists_df( + repeated_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session) + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readlocal_w_json_df( + json_pandas_df: pd.DataFrame, + compiler_session: bigframes.Session, + snapshot, +): + bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 7e001d1ac3..be8f9fc555 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1067,7 +1067,6 @@ def visit_InMemoryTable(self, op, *, name, schema, data): columns=columns, ), ) - # return expr return sg.select(sge.Star()).from_(expr) def visit_ArrayAggregate(self, op, *, arg, order_by, where):