From b58676f32f616f47b27c2479fc1b793c4c7af693 Mon Sep 17 00:00:00 2001 From: Shachar Snapiri Date: Sun, 26 Nov 2023 16:27:32 +0200 Subject: [PATCH 1/3] fix: avoid implicit join when using join with unnest When using JOIN with UNNEST statements, and then creating a SELECT statement based on it, the UNNESTed table will appear twice in the FROM clause, causing an implicit join of the table with itself --- sqlalchemy_bigquery/base.py | 8 ++++ tests/unit/test_compiler.py | 84 +++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index 5297f223..0a6f2c67 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -269,6 +269,14 @@ def _known_tables(self): if table is not None: known_tables.add(table.name) + # If we have the table in the `from` of our parent, do not add the alias + # as this will add the table twice and cause an implicit JOIN for that + # table on itself + if self.stack: + for from_ in self.stack[-1]["asfrom_froms"]: + if isinstance(from_, Table): + known_tables.add(from_.name) + return known_tables def visit_column( diff --git a/tests/unit/test_compiler.py b/tests/unit/test_compiler.py index db02e593..10568431 100644 --- a/tests/unit/test_compiler.py +++ b/tests/unit/test_compiler.py @@ -114,3 +114,87 @@ def test_no_alias_for_known_tables_cte(faux_conn, metadata): ) found_cte_sql = q.compile(faux_conn).string assert found_cte_sql == expected_cte_sql + + +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + table1 = setup_table( + faux_conn, "table1", metadata, sqlalchemy.Column("foo", sqlalchemy.Integer) + ) + table2 = setup_table( + faux_conn, + "table2", + metadata, + sqlalchemy.Column("foos", sqlalchemy.ARRAY(sqlalchemy.Integer)), + sqlalchemy.Column("bar", sqlalchemy.Integer), + ) + F = sqlalchemy.func + + unnested_col_name = "unnested_foos" + unnested_foos = F.unnest(table2.c.foos).alias(unnested_col_name) + unnested_foo_col = sqlalchemy.Column(unnested_col_name) + + # Set up initial query + q = sqlalchemy.select(table1.c.foo, table2.c.bar).select_from( + unnested_foos.join(table1, table1.c.foo == unnested_foo_col) + ) + + expected_initial_sql = ( + "SELECT `table1`.`foo`, `table2`.`bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" + ) + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + + q = sqlalchemy.select("*").select_from(q.subquery()) + + expected_outer_sql = ( + "SELECT * \n" + "FROM (SELECT `table1`.`foo` AS `foo`, `table2`.`bar` AS `bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`) AS `anon_1`" + ) + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql + + +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_for_inner_unnest_2(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + table1 = setup_table( + faux_conn, "table1", metadata, sqlalchemy.Column("foo", sqlalchemy.Integer) + ) + table2 = setup_table( + faux_conn, + "table2", + metadata, + sqlalchemy.Column("foos", sqlalchemy.ARRAY(sqlalchemy.Integer)), + sqlalchemy.Column("bar", sqlalchemy.Integer), + ) + F = sqlalchemy.func + + unnested_col_name = "unnested_foos" + unnested_foos = F.unnest(table2.c.foos).alias(unnested_col_name) + unnested_foo_col = sqlalchemy.Column(unnested_col_name) + + # Set up initial query + q = sqlalchemy.select(table1.c.foo).select_from( + unnested_foos.join(table1, table1.c.foo == unnested_foo_col) + ) + + expected_initial_sql = ( + "SELECT `table1`.`foo` \n" + "FROM `table2` `table2_1`, unnest(`table2_1`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" + ) + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + + q = sqlalchemy.select("*").select_from(q.subquery()) + + expected_outer_sql = ( + "SELECT * \n" + "FROM (SELECT `table1`.`foo` AS `foo` \n" + "FROM `table2` `table2_1`, unnest(`table2_1`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`) AS `anon_1`" + ) + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql From 07c97f4e384aca3bf5842a0ff182b137123f2590 Mon Sep 17 00:00:00 2001 From: Shachar Snapiri Date: Sun, 26 Nov 2023 17:32:13 +0200 Subject: [PATCH 2/3] Add safety checks --- sqlalchemy_bigquery/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index 0a6f2c67..f31a0e46 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -273,9 +273,10 @@ def _known_tables(self): # as this will add the table twice and cause an implicit JOIN for that # table on itself if self.stack: - for from_ in self.stack[-1]["asfrom_froms"]: - if isinstance(from_, Table): - known_tables.add(from_.name) + if (asfrom_froms := self.stack[-1].get("asfrom_froms")) is not None: + for from_ in asfrom_froms: + if isinstance(from_, Table): + known_tables.add(from_.name) return known_tables From 8467014bc0f3b7999e1c13ff513f43903b1be8d2 Mon Sep 17 00:00:00 2001 From: Shachar Snapiri Date: Tue, 12 Dec 2023 10:41:17 +0200 Subject: [PATCH 3/3] Add tests and fix cover --- sqlalchemy_bigquery/base.py | 9 ++- tests/unit/test_compiler.py | 130 +++++++++++++++++++++++++++++------- 2 files changed, 109 insertions(+), 30 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index f31a0e46..03488250 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -272,11 +272,10 @@ def _known_tables(self): # If we have the table in the `from` of our parent, do not add the alias # as this will add the table twice and cause an implicit JOIN for that # table on itself - if self.stack: - if (asfrom_froms := self.stack[-1].get("asfrom_froms")) is not None: - for from_ in asfrom_froms: - if isinstance(from_, Table): - known_tables.add(from_.name) + asfrom_froms = self.stack[-1].get("asfrom_froms", []) + for from_ in asfrom_froms: + if isinstance(from_, Table): + known_tables.add(from_.name) return known_tables diff --git a/tests/unit/test_compiler.py b/tests/unit/test_compiler.py index 10568431..139b6cbc 100644 --- a/tests/unit/test_compiler.py +++ b/tests/unit/test_compiler.py @@ -21,7 +21,7 @@ import sqlalchemy.exc from .conftest import setup_table -from .conftest import sqlalchemy_1_4_or_higher +from .conftest import sqlalchemy_1_4_or_higher, sqlalchemy_before_1_4 def test_constraints_are_ignored(faux_conn, metadata): @@ -116,9 +116,9 @@ def test_no_alias_for_known_tables_cte(faux_conn, metadata): assert found_cte_sql == expected_cte_sql -@sqlalchemy_1_4_or_higher -def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): - # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 +def prepare_implicit_join_base_query( + faux_conn, metadata, select_from_table2, old_syntax +): table1 = setup_table( faux_conn, "table1", metadata, sqlalchemy.Column("foo", sqlalchemy.Integer) ) @@ -136,10 +136,38 @@ def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): unnested_foo_col = sqlalchemy.Column(unnested_col_name) # Set up initial query - q = sqlalchemy.select(table1.c.foo, table2.c.bar).select_from( - unnested_foos.join(table1, table1.c.foo == unnested_foo_col) + cols = [table1.c.foo, table2.c.bar] if select_from_table2 else [table1.c.foo] + q = sqlalchemy.select(cols) if old_syntax else sqlalchemy.select(*cols) + q = q.select_from(unnested_foos.join(table1, table1.c.foo == unnested_foo_col)) + return q + + +@sqlalchemy_before_1_4 +def test_no_implicit_join_asterix_for_inner_unnest_before_1_4(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + q = prepare_implicit_join_base_query(faux_conn, metadata, True, True) + expected_initial_sql = ( + "SELECT `table1`.`foo`, `table2`.`bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" ) + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + q = sqlalchemy.select(["*"]).select_from(q) + + expected_outer_sql = ( + "SELECT * \n" + "FROM (SELECT `table1`.`foo` AS `foo`, `table2`.`bar` AS `bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`)" + ) + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql + + +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_asterix_for_inner_unnest(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + q = prepare_implicit_join_base_query(faux_conn, metadata, True, False) expected_initial_sql = ( "SELECT `table1`.`foo`, `table2`.`bar` \n" "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" @@ -147,7 +175,8 @@ def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): found_initial_sql = q.compile(faux_conn).string assert found_initial_sql == expected_initial_sql - q = sqlalchemy.select("*").select_from(q.subquery()) + q = q.subquery() + q = sqlalchemy.select("*").select_from(q) expected_outer_sql = ( "SELECT * \n" @@ -158,30 +187,57 @@ def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): assert found_outer_sql == expected_outer_sql -@sqlalchemy_1_4_or_higher -def test_no_implicit_join_for_inner_unnest_2(faux_conn, metadata): +@sqlalchemy_before_1_4 +def test_no_implicit_join_for_inner_unnest_before_1_4(faux_conn, metadata): # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 - table1 = setup_table( - faux_conn, "table1", metadata, sqlalchemy.Column("foo", sqlalchemy.Integer) + q = prepare_implicit_join_base_query(faux_conn, metadata, True, True) + expected_initial_sql = ( + "SELECT `table1`.`foo`, `table2`.`bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" ) - table2 = setup_table( - faux_conn, - "table2", - metadata, - sqlalchemy.Column("foos", sqlalchemy.ARRAY(sqlalchemy.Integer)), - sqlalchemy.Column("bar", sqlalchemy.Integer), + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + + q = sqlalchemy.select([q.c.foo]).select_from(q) + + expected_outer_sql = ( + "SELECT `foo` \n" + "FROM (SELECT `table1`.`foo` AS `foo`, `table2`.`bar` AS `bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`)" ) - F = sqlalchemy.func + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql - unnested_col_name = "unnested_foos" - unnested_foos = F.unnest(table2.c.foos).alias(unnested_col_name) - unnested_foo_col = sqlalchemy.Column(unnested_col_name) - # Set up initial query - q = sqlalchemy.select(table1.c.foo).select_from( - unnested_foos.join(table1, table1.c.foo == unnested_foo_col) +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_for_inner_unnest(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + q = prepare_implicit_join_base_query(faux_conn, metadata, True, False) + expected_initial_sql = ( + "SELECT `table1`.`foo`, `table2`.`bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" ) + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + + q = q.subquery() + q = sqlalchemy.select(q.c.foo).select_from(q) + + expected_outer_sql = ( + "SELECT `anon_1`.`foo` \n" + "FROM (SELECT `table1`.`foo` AS `foo`, `table2`.`bar` AS `bar` \n" + "FROM `table2`, unnest(`table2`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`) AS `anon_1`" + ) + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql + +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_asterix_for_inner_unnest_no_table2_column( + faux_conn, metadata +): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + q = prepare_implicit_join_base_query(faux_conn, metadata, False, False) expected_initial_sql = ( "SELECT `table1`.`foo` \n" "FROM `table2` `table2_1`, unnest(`table2_1`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" @@ -189,7 +245,8 @@ def test_no_implicit_join_for_inner_unnest_2(faux_conn, metadata): found_initial_sql = q.compile(faux_conn).string assert found_initial_sql == expected_initial_sql - q = sqlalchemy.select("*").select_from(q.subquery()) + q = q.subquery() + q = sqlalchemy.select("*").select_from(q) expected_outer_sql = ( "SELECT * \n" @@ -198,3 +255,26 @@ def test_no_implicit_join_for_inner_unnest_2(faux_conn, metadata): ) found_outer_sql = q.compile(faux_conn).string assert found_outer_sql == expected_outer_sql + + +@sqlalchemy_1_4_or_higher +def test_no_implicit_join_for_inner_unnest_no_table2_column(faux_conn, metadata): + # See: https://github.com/googleapis/python-bigquery-sqlalchemy/issues/368 + q = prepare_implicit_join_base_query(faux_conn, metadata, False, False) + expected_initial_sql = ( + "SELECT `table1`.`foo` \n" + "FROM `table2` `table2_1`, unnest(`table2_1`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`" + ) + found_initial_sql = q.compile(faux_conn).string + assert found_initial_sql == expected_initial_sql + + q = q.subquery() + q = sqlalchemy.select(q.c.foo).select_from(q) + + expected_outer_sql = ( + "SELECT `anon_1`.`foo` \n" + "FROM (SELECT `table1`.`foo` AS `foo` \n" + "FROM `table2` `table2_1`, unnest(`table2_1`.`foos`) AS `unnested_foos` JOIN `table1` ON `table1`.`foo` = `unnested_foos`) AS `anon_1`" + ) + found_outer_sql = q.compile(faux_conn).string + assert found_outer_sql == expected_outer_sql