From c32c32eca47baf10569fbb3276295eef0380ee52 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 27 Oct 2023 17:06:03 +0000 Subject: [PATCH 1/6] test: add code snippets for loading data from BigQuery Job --- .../load_data_from_biquery_job_test.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 samples/snippets/load_data_from_biquery_job_test.py diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py new file mode 100644 index 0000000000..2039461b83 --- /dev/null +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -0,0 +1,37 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_bigquery_dataframes_load_data_from_bigquery_job(): + # [START bigquery_dataframes_load_data_from_bigquery_job] + from google.cloud import bigquery + + import bigframes.pandas as bpd + + # Project ID inserted based on the query results selected to explore + project = "bigframes-dev" + # Location inserted based on the query results selected to explore + location = "us" + client = bigquery.Client(project=project, location=location) + + # Job ID inserted based on the query results selected to explore + job_id = "a9dbb6a9-db2d-46a5-a497-8cca2159ddeb" + job = client.get_job(job_id) + destination = str(job.destination) + + # Load data from a BigQuery table using BigFrames DataFrames: + bq_df = bpd.read_gbq_table(destination) + + # [END bigquery_dataframes_load_data_from_bigquery_job] + assert bq_df is not None From be02020daa86ab0cfd361e9664cc580d7bb93d00 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 27 Oct 2023 18:26:22 +0000 Subject: [PATCH 2/6] fix: address the comments --- samples/snippets/load_data_from_biquery_job_test.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 2039461b83..22100620e2 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -14,6 +14,16 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): + import bigframes.pandas as bpd + + bpd.options.bigquery.project = "bigframes-dev" + bpd.options.bigquery.location = "us" + + query_or_table = "bigquery-public-data.ml_datasets.penguins" + df = bpd.read_gbq(query_or_table) + df.to_pandas() + JOB_ID = df.query_job.job_id + # [START bigquery_dataframes_load_data_from_bigquery_job] from google.cloud import bigquery @@ -26,7 +36,7 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): client = bigquery.Client(project=project, location=location) # Job ID inserted 
based on the query results selected to explore - job_id = "a9dbb6a9-db2d-46a5-a497-8cca2159ddeb" + job_id = JOB_ID job = client.get_job(job_id) destination = str(job.destination) From f774f7b8e906908b4d8e5b075dce90c94a54afda Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 27 Oct 2023 22:28:23 +0000 Subject: [PATCH 3/6] fix: fix the broken test --- samples/snippets/load_data_from_biquery_job_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 22100620e2..66a276cd21 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -16,9 +16,6 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): import bigframes.pandas as bpd - bpd.options.bigquery.project = "bigframes-dev" - bpd.options.bigquery.location = "us" - query_or_table = "bigquery-public-data.ml_datasets.penguins" df = bpd.read_gbq(query_or_table) df.to_pandas() From 02af8ab67d6cc03437197d9f6b9914867d5ee4f4 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 30 Oct 2023 18:26:36 +0000 Subject: [PATCH 4/6] use BigQuery Client library to get the job_id --- .../snippets/load_data_from_biquery_job_test.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 66a276cd21..8db333dfe9 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -14,12 +14,18 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): - import bigframes.pandas as bpd + from google.cloud import bigquery + + # Construct a BigQuery client object. 
+ client = bigquery.Client(project="bigframes-dev", location="us") - query_or_table = "bigquery-public-data.ml_datasets.penguins" - df = bpd.read_gbq(query_or_table) - df.to_pandas() - JOB_ID = df.query_job.job_id + query = """ + SELECT * + FROM `bigquery-public-data.ml_datasets.penguins` + LIMIT 20 + """ + query_job = client.query(query) + JOB_ID = query_job.job_id # [START bigquery_dataframes_load_data_from_bigquery_job] from google.cloud import bigquery From 714845c921d691e121e154fa7794f2fc38d481b1 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 30 Oct 2023 18:56:14 +0000 Subject: [PATCH 5/6] feat: Implement operator `@` for `DataFrame.dot` (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 🦕 --- bigframes/dataframe.py | 2 ++ tests/system/small/test_dataframe.py | 33 +++++++++++++++++++++++++++ tests/system/small/test_multiindex.py | 16 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 49d7ad991a..3369fb4868 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2707,3 +2707,5 @@ def get_right_id(id): result = result[other.name].rename() return result + + __matmul__ = dot diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b503f9a31d..c96faa3526 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3264,6 +3264,23 @@ def test_df_dot( ) +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
+ for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3278,3 +3295,19 @@ def test_df_dot_series( bf_result, pd_result, ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d6bf46f77c..bc35f633fd 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -998,6 +998,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + # right multi-index right_index = pandas.MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "bb")]) bf1 = bpd.DataFrame(left_matrix) @@ -1005,6 +1008,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + def test_column_multi_index_dot_not_supported(): left_matrix = [[1, 2, 3], [2, 5, 7]] @@ -1022,6 +1028,11 @@ def test_column_multi_index_dot_not_supported(): ): bf1.dot(bf2) + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 + # right 
multi-columns bf1 = bpd.DataFrame(left_matrix) bf2 = bpd.DataFrame(right_matrix, columns=multi_level_columns) @@ -1029,3 +1040,8 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1.dot(bf2) + + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 From d96831509c64fa861700e795f2002ab088c59d85 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 30 Oct 2023 21:40:50 +0000 Subject: [PATCH 6/6] fix: fix the comments --- samples/snippets/load_data_from_biquery_job_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 8db333dfe9..5271574a49 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -26,6 +26,7 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): """ query_job = client.query(query) JOB_ID = query_job.job_id + your_project_id = "bigframes-dev" # [START bigquery_dataframes_load_data_from_bigquery_job] from google.cloud import bigquery @@ -33,7 +34,7 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): import bigframes.pandas as bpd # Project ID inserted based on the query results selected to explore - project = "bigframes-dev" + project = your_project_id # Location inserted based on the query results selected to explore location = "us" client = bigquery.Client(project=project, location=location)