From a0025107df921396300c457024267156dfa636d4 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 17 Oct 2023 23:27:23 +0000 Subject: [PATCH 01/11] test: add sample code for use BigFrames developer guide --- samples/snippets/sample.py | 56 +++++++++++++++++++++++++++++++++ samples/snippets/sample_test.py | 31 ++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 samples/snippets/sample.py create mode 100644 samples/snippets/sample_test.py diff --git a/samples/snippets/sample.py b/samples/snippets/sample.py new file mode 100644 index 0000000000..03a66a34e7 --- /dev/null +++ b/samples/snippets/sample.py @@ -0,0 +1,56 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def use_bigquery_dataframes(project_id: str): + # [START bigquery_dataframes_set_options] + import bigframes.pandas as bpd + + PROJECT_ID = project_id # @param {type:"string"} + REGION = "US" # @param {type:"string"} + + # Set BigQuery DataFrames options + bpd.options.bigquery.project = PROJECT_ID + bpd.options.bigquery.location = REGION + + # [END bigquery_dataframes_set_options] + + # [START bigquery_dataframes_load_data_from_bigquery] + # Create a DataFrame from a BigQuery table: + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + # [END bigquery_dataframes_load_data_from_bigquery] + + # [START bigquery_dataframes_load_data_from_csv] + filepath_or_buffer = ( + "gs://bigquery-public-data-ml-datasets/holidays_and_events_for_forecasting.csv" + ) + df_from_gcs = bpd.read_csv(filepath_or_buffer) + # Display the first few rows of the DataFrame: + df_from_gcs.head() + # [END bigquery_dataframes_load_data_from_csv] + + # [START bigquery_dataframes_inspect_and_manipulate_data] + # Inspect one of the columns (or series) of the DataFrame: + bq_df["body_mass_g"].head(10) + + # Compute the mean of this series: + average_body_mass = bq_df["body_mass_g"].mean() + print(f"average_body_mass: {average_body_mass}") + + # Calculate the mean body_mass_g by species using the groupby operation: + bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() + # [END bigquery_dataframes_inspect_and_manipulate_data] + + # TODO(ashleyxu): Add samples for loading DataFrames to BigQuery table. diff --git a/samples/snippets/sample_test.py b/samples/snippets/sample_test.py new file mode 100644 index 0000000000..57376876d4 --- /dev/null +++ b/samples/snippets/sample_test.py @@ -0,0 +1,31 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.pandas + +from . import sample + + +def test_remote_function_and_read_gbq_function( + capsys: pytest.CaptureFixture[str], +) -> None: + # We need a fresh session since we're modifying connection options. + bigframes.pandas.close_session() + + # TODO(swast): Get project from environment so contributors can run tests. + sample.use_bigquery_dataframes("bigframes-dev") + out, _ = capsys.readouterr() + assert "average_body_mass: " in out From f32eac8df4969c8fe03e0562424a6b16cd1346d8 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 19 Oct 2023 03:16:11 +0000 Subject: [PATCH 02/11] fix: create standalone, testable, and runnable code snippets --- ...ple_test.py => load_data_from_bigquery.py} | 23 +++++--------- samples/snippets/load_data_from_csv.py | 27 ++++++++++++++++ .../snippets/{sample.py => pandas_methods.py} | 31 +++---------------- samples/snippets/set_options.py | 29 +++++++++++++++++ 4 files changed, 68 insertions(+), 42 deletions(-) rename samples/snippets/{sample_test.py => load_data_from_bigquery.py} (54%) create mode 100644 samples/snippets/load_data_from_csv.py rename samples/snippets/{sample.py => pandas_methods.py} (50%) create mode 100644 samples/snippets/set_options.py diff --git a/samples/snippets/sample_test.py b/samples/snippets/load_data_from_bigquery.py similarity index 54% rename from samples/snippets/sample_test.py rename to samples/snippets/load_data_from_bigquery.py index 57376876d4..e4c65688bd 100644 --- a/samples/snippets/sample_test.py +++ b/samples/snippets/load_data_from_bigquery.py @@ -12,20 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import bigframes.pandas +def test_bigquery_dataframes_load_data_from_bigquery(): + # [START bigquery_dataframes_load_data_from_bigquery] + # Create a DataFrame from a BigQuery table: + import bigframes.pandas as bpd -from . import sample - - -def test_remote_function_and_read_gbq_function( - capsys: pytest.CaptureFixture[str], -) -> None: - # We need a fresh session since we're modifying connection options. - bigframes.pandas.close_session() - - # TODO(swast): Get project from environment so contributors can run tests. - sample.use_bigquery_dataframes("bigframes-dev") - out, _ = capsys.readouterr() - assert "average_body_mass: " in out + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + # [END bigquery_dataframes_load_data_from_bigquery] + assert bq_df is not None diff --git a/samples/snippets/load_data_from_csv.py b/samples/snippets/load_data_from_csv.py new file mode 100644 index 0000000000..d92af83d45 --- /dev/null +++ b/samples/snippets/load_data_from_csv.py @@ -0,0 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_load_data_from_csv(): + # [START bigquery_dataframes_load_data_from_csv] + import bigframes.pandas as bpd + + filepath_or_buffer = ( + "gs://bigquery-public-data-ml-datasets/holidays_and_events_for_forecasting.csv" + ) + df_from_gcs = bpd.read_csv(filepath_or_buffer) + # Display the first few rows of the DataFrame: + df_from_gcs.head() + # [END bigquery_dataframes_load_data_from_csv] + assert df_from_gcs is not None diff --git a/samples/snippets/sample.py b/samples/snippets/pandas_methods.py similarity index 50% rename from samples/snippets/sample.py rename to samples/snippets/pandas_methods.py index 03a66a34e7..f9b4b2f366 100644 --- a/samples/snippets/sample.py +++ b/samples/snippets/pandas_methods.py @@ -13,35 +13,14 @@ # limitations under the License. -def use_bigquery_dataframes(project_id: str): - # [START bigquery_dataframes_set_options] +def test_bigquery_dataframes_pandas_methods(): + # [START test_bigquery_dataframes_pandas_methods] import bigframes.pandas as bpd - PROJECT_ID = project_id # @param {type:"string"} - REGION = "US" # @param {type:"string"} - - # Set BigQuery DataFrames options - bpd.options.bigquery.project = PROJECT_ID - bpd.options.bigquery.location = REGION - - # [END bigquery_dataframes_set_options] - - # [START bigquery_dataframes_load_data_from_bigquery] - # Create a DataFrame from a BigQuery table: + # Load data from BigQuery query_or_table = "bigquery-public-data.ml_datasets.penguins" bq_df = bpd.read_gbq(query_or_table) - # [END bigquery_dataframes_load_data_from_bigquery] - # [START bigquery_dataframes_load_data_from_csv] - filepath_or_buffer = ( - "gs://bigquery-public-data-ml-datasets/holidays_and_events_for_forecasting.csv" - ) - df_from_gcs = bpd.read_csv(filepath_or_buffer) - # Display the first few rows of the DataFrame: - df_from_gcs.head() - # [END bigquery_dataframes_load_data_from_csv] - - # [START bigquery_dataframes_inspect_and_manipulate_data] # Inspect one of the columns (or series) of the DataFrame: bq_df["body_mass_g"].head(10) @@ -51,6 +30,4 @@ def use_bigquery_dataframes(project_id: str): # Calculate the mean body_mass_g by species using the groupby operation: bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() - # [END bigquery_dataframes_inspect_and_manipulate_data] - - # TODO(ashleyxu): Add samples for loading DataFrames to BigQuery table. + # [END test_bigquery_dataframes_pandas_methods] diff --git a/samples/snippets/set_options.py b/samples/snippets/set_options.py new file mode 100644 index 0000000000..f728d0cddd --- /dev/null +++ b/samples/snippets/set_options.py @@ -0,0 +1,29 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_set_options(): + # [START bigquery_dataframes_set_options] + import bigframes.pandas as bpd + + PROJECT_ID = "bigframes-dec" # @param {type:"string"} + REGION = "US" # @param {type:"string"} + + # Set BigQuery DataFrames options + bpd.options.bigquery.project = PROJECT_ID + bpd.options.bigquery.location = REGION + + # [END bigquery_dataframes_set_options] + assert bpd.options.bigquery.project == PROJECT_ID + assert bpd.options.bigquery.location == REGION From a6545a3683e8641906291406371a236fb44c5004 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 19 Oct 2023 03:20:17 +0000 Subject: [PATCH 03/11] add one minor assertion --- samples/snippets/pandas_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/snippets/pandas_methods.py b/samples/snippets/pandas_methods.py index f9b4b2f366..0b2f55ac6e 100644 --- a/samples/snippets/pandas_methods.py +++ b/samples/snippets/pandas_methods.py @@ -31,3 +31,4 @@ def test_bigquery_dataframes_pandas_methods(): # Calculate the mean body_mass_g by species using the groupby operation: bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() # [END test_bigquery_dataframes_pandas_methods] + assert average_body_mass == average_body_mass From 48a08dceda8ab618e440862c4d73ff7b92a3dc91 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 20 Oct 2023 04:10:10 +0000 Subject: [PATCH 04/11] fix: rename the snippet --- samples/snippets/apply_methods.py | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 samples/snippets/apply_methods.py diff --git a/samples/snippets/apply_methods.py b/samples/snippets/apply_methods.py new file mode 100644 index 0000000000..7a7f888c64 --- /dev/null +++ b/samples/snippets/apply_methods.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_apply_methods(): + # [START test_bigquery_dataframes_apply_methods] + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Inspect one of the columns (or series) of the DataFrame: + bq_df["body_mass_g"].head(10) + + # Compute the mean of this series: + average_body_mass = bq_df["body_mass_g"].mean() + print(f"average_body_mass: {average_body_mass}") + + # Calculate the mean body_mass_g by species using the groupby operation: + bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() + # [END test_bigquery_dataframes_apply_methods] + assert average_body_mass == average_body_mass From 7a34910da34a9f071a32f618ba0e374e9e5fe754 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 20 Oct 2023 04:15:30 +0000 Subject: [PATCH 05/11] fix the snippet name --- samples/snippets/apply_methods.py | 34 ------------------------------ samples/snippets/pandas_methods.py | 4 ++-- 2 files changed, 2 insertions(+), 36 deletions(-) delete mode 100644 samples/snippets/apply_methods.py diff --git a/samples/snippets/apply_methods.py b/samples/snippets/apply_methods.py deleted file mode 100644 index 7a7f888c64..0000000000 --- a/samples/snippets/apply_methods.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def test_bigquery_dataframes_apply_methods(): - # [START test_bigquery_dataframes_apply_methods] - import bigframes.pandas as bpd - - # Load data from BigQuery - query_or_table = "bigquery-public-data.ml_datasets.penguins" - bq_df = bpd.read_gbq(query_or_table) - - # Inspect one of the columns (or series) of the DataFrame: - bq_df["body_mass_g"].head(10) - - # Compute the mean of this series: - average_body_mass = bq_df["body_mass_g"].mean() - print(f"average_body_mass: {average_body_mass}") - - # Calculate the mean body_mass_g by species using the groupby operation: - bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() - # [END test_bigquery_dataframes_apply_methods] - assert average_body_mass == average_body_mass diff --git a/samples/snippets/pandas_methods.py b/samples/snippets/pandas_methods.py index 0b2f55ac6e..153d37ca74 100644 --- a/samples/snippets/pandas_methods.py +++ b/samples/snippets/pandas_methods.py @@ -14,7 +14,7 @@ def test_bigquery_dataframes_pandas_methods(): - # [START test_bigquery_dataframes_pandas_methods] + # [START bigquery_dataframes_pandas_methods] import bigframes.pandas as bpd # Load data from BigQuery @@ -30,5 +30,5 @@ def test_bigquery_dataframes_pandas_methods(): # Calculate the mean body_mass_g by species using the groupby operation: bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() - # [END test_bigquery_dataframes_pandas_methods] + # [END bigquery_dataframes_pandas_methods] assert average_body_mass == average_body_mass From 7f3e202b2b0dd4865b3295908665c85e43faff28 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 20 Oct 2023 10:52:49 -0500 Subject: [PATCH 06/11] feat: add back `reset_session` as an alias for `close_session` (#124) --- bigframes/pandas/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1b9144fb62..24b19fa70a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -452,6 +452,7 @@ def read_gbq_function(function_name: str): # Session management APIs get_global_session = global_session.get_global_session close_session = global_session.close_session +reset_session = global_session.close_session # Use __all__ to let type checkers know what is part of the public API. From 2a4fba47dcc83a41680913f627b9e3e909513fba Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 20 Oct 2023 16:30:21 +0000 Subject: [PATCH 07/11] fix: add to filename --- ...load_data_from_bigquery.py => load_data_from_bigquery_test.py} | 0 .../{load_data_from_csv.py => load_data_from_csv_test.py} | 0 samples/snippets/{pandas_methods.py => pandas_methods_test.py} | 0 samples/snippets/{set_options.py => set_options_test.py} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename samples/snippets/{load_data_from_bigquery.py => load_data_from_bigquery_test.py} (100%) rename samples/snippets/{load_data_from_csv.py => load_data_from_csv_test.py} (100%) rename samples/snippets/{pandas_methods.py => pandas_methods_test.py} (100%) rename samples/snippets/{set_options.py => set_options_test.py} (100%) diff --git a/samples/snippets/load_data_from_bigquery.py b/samples/snippets/load_data_from_bigquery_test.py similarity index 100% rename from samples/snippets/load_data_from_bigquery.py rename to samples/snippets/load_data_from_bigquery_test.py diff --git a/samples/snippets/load_data_from_csv.py b/samples/snippets/load_data_from_csv_test.py similarity index 100% rename from samples/snippets/load_data_from_csv.py rename to samples/snippets/load_data_from_csv_test.py diff --git a/samples/snippets/pandas_methods.py b/samples/snippets/pandas_methods_test.py similarity index 100% rename from samples/snippets/pandas_methods.py rename to samples/snippets/pandas_methods_test.py diff --git a/samples/snippets/set_options.py b/samples/snippets/set_options_test.py similarity index 100% rename from samples/snippets/set_options.py rename to samples/snippets/set_options_test.py From a6d79e792b3c3da71cf86cf84cd785707d47952b Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 20 Oct 2023 10:55:16 -0700 Subject: [PATCH 08/11] Update samples/snippets/pandas_methods_test.py Co-authored-by: Tim Swast --- samples/snippets/pandas_methods_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py index 153d37ca74..1f472d6346 100644 --- a/samples/snippets/pandas_methods_test.py +++ b/samples/snippets/pandas_methods_test.py @@ -31,4 +31,4 @@ def test_bigquery_dataframes_pandas_methods(): # Calculate the mean body_mass_g by species using the groupby operation: bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() # [END bigquery_dataframes_pandas_methods] - assert average_body_mass == average_body_mass + assert average_body_mass is not None From 42ec25d03d4478e181892962aff3191bdbf5b979 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 20 Oct 2023 22:27:22 -0700 Subject: [PATCH 09/11] docs: add runnable code samples for reading methods (#125) * docs: add runnable and testable I/O code samples * docs: add runnable and testable reading methods code snippets * fix: assign a df and show the first 2 rows * address comments --- bigframes/session/__init__.py | 100 ++++++++++++++++++ .../bigframes_vendored/pandas/io/gbq.py | 24 +++-- .../bigframes_vendored/pandas/io/parquet.py | 14 +++ .../pandas/io/parsers/readers.py | 30 +++++- .../bigframes_vendored/pandas/io/pickle.py | 18 ++++ 5 files changed, 179 insertions(+), 7 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2f001d7d49..5ec3da1a5a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -342,6 +342,51 @@ def read_gbq_query( ``row_number() over ()`` if there is no natural unique index or you want to preserve ordering. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Simple query input: + + >>> df = bpd.read_gbq_query(''' + ... SELECT + ... pitcherFirstName, + ... pitcherLastName, + ... pitchSpeed, + ... FROM `bigquery-public-data.baseball.games_wide` + ... ''') + >>> df.head(2) + pitcherFirstName pitcherLastName pitchSpeed + 0 0 + 1 0 + + [2 rows x 3 columns] + + Preserve ordering in a query input. + + >>> df = bpd.read_gbq_query(''' + ... SELECT + ... -- Instead of an ORDER BY clause on the query, use + ... -- ROW_NUMBER() to create an ordered DataFrame. + ... ROW_NUMBER() OVER (ORDER BY AVG(pitchSpeed) DESC) + ... AS rowindex, + ... + ... pitcherFirstName, + ... pitcherLastName, + ... AVG(pitchSpeed) AS averagePitchSpeed + ... FROM `bigquery-public-data.baseball.games_wide` + ... WHERE year = 2016 + ... GROUP BY pitcherFirstName, pitcherLastName + ... ''', index_col="rowindex") + >>> df.head(2) + pitcherFirstName pitcherLastName averagePitchSpeed + rowindex + 1 Albertin Chapman 96.514113 + 2 Zachary Britton 94.591039 + + [2 rows x 3 columns] + See also: :meth:`Session.read_gbq`. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so @@ -405,6 +450,25 @@ def read_gbq_table( ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). + + >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + See also: :meth:`Session.read_gbq`. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so @@ -792,6 +856,16 @@ def _read_ibis( def read_gbq_model(self, model_name: str): """Loads a BigQuery ML model from BigQuery. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Read an existing BigQuery ML model. + + >>> model_name = "bigframes-dev.bqml_tutorial.penguins_model" + >>> model = bpd.read_gbq_model(model_name) + Args: model_name (str): the model's name in BigQuery in the format @@ -815,6 +889,22 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame The pandas DataFrame will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> pandas_df = pd.DataFrame(data=d) + >>> df = bpd.read_pandas(pandas_df) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + Args: pandas_dataframe (pandas.DataFrame): a pandas DataFrame object to be loaded. @@ -1365,6 +1455,16 @@ def read_gbq_function( The return type of the function must be explicitly specified in the function's original definition even if not otherwise required. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" + >>> func = bpd.read_gbq_function(function_name=function_name) + >>> func.bigframes_remote_function + 'bqutil.fn.cw_lower_case_ascii_only' + Args: function_name (str): the function's name in BigQuery in the format diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 8919f4ed16..575c501618 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -42,9 +42,23 @@ def read_gbq( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None + If the input is a table ID: + + >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + Preserve ordering in a query input. - >>> bpd.read_gbq(''' + >>> df = bpd.read_gbq(''' ... SELECT ... -- Instead of an ORDER BY clause on the query, use ... -- ROW_NUMBER() to create an ordered DataFrame. @@ -57,16 +71,14 @@ def read_gbq( ... FROM `bigquery-public-data.baseball.games_wide` ... WHERE year = 2016 ... GROUP BY pitcherFirstName, pitcherLastName - ... ''', index_col="rowindex").head(n=5) + ... ''', index_col="rowindex") + >>> df.head(2) pitcherFirstName pitcherLastName averagePitchSpeed rowindex 1 Albertin Chapman 96.514113 2 Zachary Britton 94.591039 - 3 Trevor Rosenthal 94.213953 - 4 Jose Torres 94.103448 - 5 Tayron Guerrero 93.863636 - [5 rows x 3 columns] + [2 rows x 3 columns] Args: query_or_table (str): diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 9aed9af5a8..f97bd386a4 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -17,6 +17,20 @@ def read_parquet( Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" + >>> df = bpd.read_parquet(path=gcs_path) + >>> df.head(2) + name post_abbr + 0 Alabama AL + 1 Alaska AK + + [2 rows x 2 columns] + Args: path (str): Local or Cloud Storage path to Parquet file. diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index d19a92ecdf..e8ed6182a6 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -55,6 +55,20 @@ def read_csv( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) + >>> df.head(2) + name post_abbr + 0 Alabama AL + 1 Alaska AK + + [2 rows x 2 columns] + Args: filepath_or_buffer (str): A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` @@ -64,7 +78,7 @@ def read_csv( can be any ISO-8859-1 single-byte character. To use a character in the range 128-255, you must encode the character as UTF-8. Both engines support `sep="\t"` to specify tab character as separator. Default engine supports - having any number of spaces as separator by specifying `sep="\s+"`. Separators + having any number of spaces as separator by specifying `sep="\\s+"`. Separators longer than 1 character are interpreted as regular expressions by the default engine. BigQuery engine only supports single character separators. header (Optional[int], default 0): @@ -146,6 +160,20 @@ def read_json( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" + >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") + >>> df.head(2) + id name + 0 1 Alice + 1 2 Bob + + [2 rows x 2 columns] + Args: path_or_buf (a valid JSON str, path object or file-like object): A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 71b31956a0..053ba4871c 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -25,6 +25,24 @@ def read_pickle( If the content of the pickle file is a Series and its name attribute is None, the name will be set to '0' by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" + >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) + >>> df.head(2) + species island culmen_length_mm \\ + 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 + 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 + + culmen_depth_mm flipper_length_mm body_mass_g sex + 0 18.4 184.0 3475.0 FEMALE + 1 19.1 184.0 4650.0 MALE + + [2 rows x 7 columns] + Args: filepath_or_buffer (str, path object, or file-like object): String, path object (implementing os.PathLike[str]), or file-like object From 19cdf38a8a5bb81b03fff920bea1894ab576753a Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 20 Oct 2023 16:30:21 +0000 Subject: [PATCH 10/11] fix: change to public csv bucket --- samples/snippets/load_data_from_csv_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/snippets/load_data_from_csv_test.py b/samples/snippets/load_data_from_csv_test.py index d92af83d45..31ab9255bf 100644 --- a/samples/snippets/load_data_from_csv_test.py +++ b/samples/snippets/load_data_from_csv_test.py @@ -17,9 +17,7 @@ def test_bigquery_dataframes_load_data_from_csv(): # [START bigquery_dataframes_load_data_from_csv] import bigframes.pandas as bpd - filepath_or_buffer = ( - "gs://bigquery-public-data-ml-datasets/holidays_and_events_for_forecasting.csv" - ) + filepath_or_buffer = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" df_from_gcs = bpd.read_csv(filepath_or_buffer) # Display the first few rows of the DataFrame: df_from_gcs.head() From e8f2d6a4ba2f44edfab3caf303de9cfe2fa67a40 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Sat, 21 Oct 2023 16:24:08 +0000 Subject: [PATCH 11/11] fix: close the session before resetting --- samples/snippets/set_options_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index f728d0cddd..ef6f41ce54 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -14,6 +14,11 @@ def test_bigquery_dataframes_set_options(): + # Close the session before resetting the options + import bigframes.pandas as bpd + + bpd.close_session() + # [START bigquery_dataframes_set_options] import bigframes.pandas as bpd