From e8532b1d999d26ea1ebdd30efb8f2c0a93a6a28d Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 16 Nov 2023 10:08:27 -0800 Subject: [PATCH 01/26] fix: polish the llm+kmeans notebook (#208) --- .../bq_dataframes_llm_code_generation.ipynb | 2 +- .../bq_dataframes_llm_kmeans.ipynb | 1181 +++++++++++++++-- 2 files changed, 1057 insertions(+), 126 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 0f113b84c6..0a41447a53 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -34,7 +34,7 @@ "\n", "\n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 46c4955288..ae03813639 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -31,7 +31,7 @@ "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", "
\n", "\n", " \n", @@ -118,14 +118,10 @@ "\n", "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", "\n", - "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n", + "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,aiplatform.googleapis.com) to enable the following APIs:\n", "\n", " * BigQuery API\n", " * BigQuery Connection API\n", - " * Cloud Run API\n", - " * Artifact Registry API\n", - " * Cloud Build API\n", - " * Cloud Resource Manager API\n", " * Vertex AI API\n", "\n", "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." @@ -143,9 +139,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated property [core/project].\n" + ] + } + ], "source": [ "# set your project ID below\n", "PROJECT_ID = \"\" # @param {type:\"string\"}\n", @@ -166,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -232,87 +236,6 @@ "# auth.authenticate_user()" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Connect to Vertex AI\n", - "\n", - "In order to use PaLM2TextGenerator, we will need to set up a [cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from google.cloud import bigquery_connection_v1 as bq_connection\n", - "\n", - "CONN_NAME = \"bqdf-llm\"\n", - "\n", - "client = bq_connection.ConnectionServiceClient()\n", - "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n", - "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n", - "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", - "\n", - "try:\n", - " request = client.get_connection(\n", - " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", - " )\n", - " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", - "except Exception:\n", - " connection = bq_connection.types.Connection(\n", - " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n", - " )\n", - " request = bq_connection.CreateConnectionRequest(\n", - " {\n", - " \"parent\": new_conn_parent,\n", - " \"connection_id\": CONN_NAME,\n", - " \"connection\": connection,\n", - " }\n", - " )\n", - " response = client.create_connection(request)\n", - " CONN_SERVICE_ACCOUNT = (\n", - " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", - " )\n", - "print(CONN_SERVICE_ACCOUNT)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set permissions for the service account\n", - "\n", - "The resource connection service account requires certain project-level permissions:\n", - " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n", - " - `roles/run.invoker`: This role is required for the connection to have read-only access to Cloud Run services that back custom/remote functions ([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n", - "\n", - "Set these permissions by running the following `gcloud` commands:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -336,12 +259,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Project Setup" + "BigQuery DataFrames setup" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "R7STCS8xB5d2" }, @@ -353,6 +276,14 @@ "bf.options.bigquery.location = REGION" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." 
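The session setup above boils down to pointing the `gcloud` CLI and the BigQuery DataFrames session at the same project and location. A minimal sketch of the pattern that sentence describes, assuming placeholder region values rather than anything this diff pins down:

```python
# Sketch of the session-location pattern the notebook describes; "US" and "EU"
# are placeholder values, and bf.close_session() is the reset hook named above.
import bigframes.pandas as bf

REGION = "US"  # location used for this session
bf.options.bigquery.location = REGION

# ... create DataFrames; the location is now fixed for this session ...

# To target a different location later, close the session and set it again.
bf.close_session()
bf.options.bigquery.location = "EU"  # hypothetical second location
```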
+ ] + }, { "attachments": {}, "cell_type": "markdown", @@ -365,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "zDSwoBo1CU3G" }, @@ -376,11 +307,101 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "tYDoaKgJChiq" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 9f096761-e3b5-4d58-a9f7-485ced67afca is DONE. 2.3 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ee8fecb1-2e30-407d-9e2e-9e76061da9e7 is DONE. 2.3 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consumer_complaint_narrative
0I signed a contract as a condition of employme...
1First, I want to disclose that XXXX and XXXX b...
2Frequent calls from Focused Receivables Manage...
3I recently contacted Enhanced Recovery Company...
4This began when I subscribed to XXXX XXXX inte...
\n", + "

5 rows × 1 columns

\n", + "[5 rows x 1 columns in total]" + ], + "text/plain": [ + " consumer_complaint_narrative\n", + "0 I signed a contract as a condition of employme...\n", + "1 First, I want to disclose that XXXX and XXXX b...\n", + "2 Frequent calls from Focused Receivables Manage...\n", + "3 I recently contacted Enhanced Recovery Company...\n", + "4 This began when I subscribed to XXXX XXXX inte...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "issues_df = input_df[[\"consumer_complaint_narrative\"]].dropna()\n", "issues_df.head(n=5) # View the first five complaints" @@ -391,12 +412,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Download 10000 complaints to use with PaLM2TextEmbeddingGenerator" + "Downsample DataFrame to 10,000 records for model training." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "OltYSUEcsSOW" }, @@ -418,11 +439,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "li38q8FzDDMu" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 52d2e961-7896-497c-8b03-ab7374737679 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from bigframes.ml.llm import PaLM2TextEmbeddingGenerator\n", "\n", @@ -431,11 +465,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "id": "cOuSOQ5FDewD" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job d093d51a-8eda-442f-80cd-568cb76e00b3 is DONE. 10.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 6419df65-3e96-41a7-a7b5-3d058e18763a is DONE. 80.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 917f09ea-c468-4363-a856-b1091e5f775f is DONE. 80.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5c9679e7-192c-40b5-a14b-edc0fa113eaa is DONE. 61.5 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text_embedding
422[-0.012013785541057587, 0.003669967409223318, ...
616[-0.014948881231248379, -0.04672442376613617, ...
833[-0.01951478235423565, -0.027120858430862427, ...
1370[-0.03140445053577423, -0.048797041177749634, ...
1430[-0.02244548313319683, -0.03336532413959503, 0...
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " text_embedding\n", + "422 [-0.012013785541057587, 0.003669967409223318, ...\n", + "616 [-0.014948881231248379, -0.04672442376613617, ...\n", + "833 [-0.01951478235423565, -0.027120858430862427, ...\n", + "1370 [-0.03140445053577423, -0.048797041177749634, ...\n", + "1430 [-0.02244548313319683, -0.03336532413959503, 0...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Will take ~3 minutes to compute the embeddings\n", "predicted_embeddings = model.predict(downsampled_issues_df)\n", @@ -445,14 +593,263 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "id": "4H_etYfsEOFP" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job ce9cb0f9-4b0d-40a1-81f3-d6e60dd6c684 is DONE. 160.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job aa692a30-5706-46ad-8029-faf2fac66234 is DONE. 72.2 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
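The embedding step itself is only two calls; a sketch of the shape, assuming the model picks up the session's default BigQuery connection as this polished version of the notebook intends:

```python
# predict() takes a DataFrame with a single text column and returns a
# DataFrame whose `text_embedding` column holds one float array per row.
from bigframes.ml.llm import PaLM2TextEmbeddingGenerator

model = PaLM2TextEmbeddingGenerator()
predicted_embeddings = model.predict(downsampled_issues_df)
predicted_embeddings.head()
```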
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consumer_complaint_narrativetext_embedding
2580664Hello, my name is XXXX XXXX, and I am writing ...[0.0003211698785889894, -0.01816680282354355, ...
1806973This is XXXX XXXX and I am submitting this com...[-0.009485247544944286, -0.025846892967820168,...
2055053XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX...[-0.010950954630970955, -0.0249345600605011, 0...
2515231When I reinvestigated my credit report, I real...[-0.009660656563937664, -0.05793113633990288, ...
2633049Checking my credit report XX/XX/2018 with all ...[-0.0022159104701131582, -0.03330004960298538,...
3117273I contacted TransUnion and spoke a credit rep ...[-0.015955328941345215, -0.006488671060651541,...
698814XXXX XXXX XXXX. makes daily calls to me cell c...[0.005397460889071226, -0.01276913657784462, 0...
267826Can we please reopen Case : XXXX? \n", + "\n", + "Wells Farg...[0.004065403249114752, -0.0005381882656365633,...
54019My rights under 15 USC 1681 have been violated...[0.013823015615344048, -0.02010691538453102, 0...
141050To whom it may concern : My personal informati...[0.008104532025754452, -0.01856449618935585, 0...
2962076I have had a CashApp account since last year, ...[-0.0003019514260813594, -0.03750108182430267,...
2481105that some of the information was erroneous. Th...[-0.014868081547319889, -0.0443895161151886, -...
431562I have disputed the referenced accounts to the...[-0.0020524838473647833, -0.04830990731716156,...
1953029On, XX/XX/22, I attempted to complete a transa...[-0.01599179394543171, -0.0074900356121361256,...
2395979Subject : XXXX XXXX XXXX compensation, refund,...[-0.0035950862802565098, -0.014652969315648079...
455524I paid off my mortgage on XX/XX/2019. The comp...[-0.01100730150938034, -0.03495829552412033, 0...
2155924This kind of account is placed as a charged of...[-0.028635455295443535, -0.028604287654161453,...
1069497This is one of many issues I have had with Wel...[0.008871790021657944, -0.028502725064754486, ...
3181689I have disputed this account with MONTEREY FIN...[-0.004721717908978462, -0.03673810139298439, ...
274268Lender is not updating my loan status in the V...[-0.009221495129168034, -0.0289347805082798, 0...
1671305XXXX is a peer to peer lending conmpany that u...[-0.02911308966577053, -0.01850792020559311, -...
886026( DISPUTE CODE - XXXX ) My personal informatio...[-0.007220877334475517, -0.016615957021713257,...
1044431I filed a complaint against PNC this year and ...[0.002848619595170021, -0.035117778927087784, ...
1938481I applied for a modification and was approved....[-0.03114932030439377, -0.0421406552195549, 0....
1987834Ive been Disputting my XXXX XXXX I opened this...[-0.009406660683453083, -0.020967338234186172,...
\n", + "

25 rows × 2 columns

\n", + "
[10000 rows x 2 columns in total]" + ], + "text/plain": [ + " consumer_complaint_narrative \\\n", + "2580664 Hello, my name is XXXX XXXX, and I am writing ... \n", + "1806973 This is XXXX XXXX and I am submitting this com... \n", + "2055053 XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... \n", + "2515231 When I reinvestigated my credit report, I real... \n", + "2633049 Checking my credit report XX/XX/2018 with all ... \n", + "3117273 I contacted TransUnion and spoke a credit rep ... \n", + "698814 XXXX XXXX XXXX. makes daily calls to me cell c... \n", + "267826 Can we please reopen Case : XXXX? \n", + "\n", + "Wells Farg... \n", + "54019 My rights under 15 USC 1681 have been violated... \n", + "141050 To whom it may concern : My personal informati... \n", + "2962076 I have had a CashApp account since last year, ... \n", + "2481105 that some of the information was erroneous. Th... \n", + "431562 I have disputed the referenced accounts to the... \n", + "1953029 On, XX/XX/22, I attempted to complete a transa... \n", + "2395979 Subject : XXXX XXXX XXXX compensation, refund,... \n", + "455524 I paid off my mortgage on XX/XX/2019. The comp... \n", + "2155924 This kind of account is placed as a charged of... \n", + "1069497 This is one of many issues I have had with Wel... \n", + "3181689 I have disputed this account with MONTEREY FIN... \n", + "274268 Lender is not updating my loan status in the V... \n", + "1671305 XXXX is a peer to peer lending conmpany that u... \n", + "886026 ( DISPUTE CODE - XXXX ) My personal informatio... \n", + "1044431 I filed a complaint against PNC this year and ... \n", + "1938481 I applied for a modification and was approved.... \n", + "1987834 Ive been Disputting my XXXX XXXX I opened this... \n", + "\n", + " text_embedding \n", + "2580664 [0.0003211698785889894, -0.01816680282354355, ... \n", + "1806973 [-0.009485247544944286, -0.025846892967820168,... \n", + "2055053 [-0.010950954630970955, -0.0249345600605011, 0... \n", + "2515231 [-0.009660656563937664, -0.05793113633990288, ... \n", + "2633049 [-0.0022159104701131582, -0.03330004960298538,... \n", + "3117273 [-0.015955328941345215, -0.006488671060651541,... \n", + "698814 [0.005397460889071226, -0.01276913657784462, 0... \n", + "267826 [0.004065403249114752, -0.0005381882656365633,... \n", + "54019 [0.013823015615344048, -0.02010691538453102, 0... \n", + "141050 [0.008104532025754452, -0.01856449618935585, 0... \n", + "2962076 [-0.0003019514260813594, -0.03750108182430267,... \n", + "2481105 [-0.014868081547319889, -0.0443895161151886, -... \n", + "431562 [-0.0020524838473647833, -0.04830990731716156,... \n", + "1953029 [-0.01599179394543171, -0.0074900356121361256,... \n", + "2395979 [-0.0035950862802565098, -0.014652969315648079... \n", + "455524 [-0.01100730150938034, -0.03495829552412033, 0... \n", + "2155924 [-0.028635455295443535, -0.028604287654161453,... \n", + "1069497 [0.008871790021657944, -0.028502725064754486, ... \n", + "3181689 [-0.004721717908978462, -0.03673810139298439, ... \n", + "274268 [-0.009221495129168034, -0.0289347805082798, 0... \n", + "1671305 [-0.02911308966577053, -0.01850792020559311, -... \n", + "886026 [-0.007220877334475517, -0.016615957021713257,... \n", + "1044431 [0.002848619595170021, -0.035117778927087784, ... \n", + "1938481 [-0.03114932030439377, -0.0421406552195549, 0.... \n", + "1987834 [-0.009406660683453083, -0.020967338234186172,... 
\n", + "...\n", + "\n", + "[10000 rows x 2 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Join the complaints with their embeddings in the same DataFrame\n", - "combined_df = downsampled_issues_df.join(predicted_embeddings)" + "combined_df = downsampled_issues_df.join(predicted_embeddings, how=\"left\")\n", + "combined_df" ] }, { @@ -470,12 +867,12 @@ "id": "OUZ3NNbzo1Tb" }, "source": [ - "## Step 2: KMeans clustering" + "## Step 2: Create k-means model and predict clusters" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "id": "AhNTnEC5FRz2" }, @@ -496,14 +893,152 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": { "id": "6poSxh-fGJF7" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 65eb317d-59f1-4d10-acd1-4b7f3778114c is DONE. 61.7 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 156e445e-cc01-4b30-84cc-ac1c98a69b81 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5befc212-f4a3-4e33-b1b2-01e809acdcbd is DONE. 61.9 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bd271178-8b8d-45dc-ac57-7f0194d0daac is DONE. 80.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bbfb9cca-622d-4bf5-9fc0-6d9a85287d41 is DONE. 80.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a5f30b32-9fb0-42b4-b426-d8484f008bdb is DONE. 160.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CENTROID_ID
4222
6163
8335
13707
14303
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " CENTROID_ID\n", + "422 2\n", + "616 3\n", + "833 5\n", + "1370 7\n", + "1430 3\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", + "cluster_model.fit(combined_df[\"text_embedding\"])\n", "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", @@ -512,12 +1047,123 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 7a41196e-ea67-44ac-95a7-7dce620d6d21 is DONE. 320.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 8008b482-1a0d-461f-a215-4676d9d918dc is DONE. 72.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consumer_complaint_narrativetext_embeddingCENTROID_ID
2580664Hello, my name is XXXX XXXX, and I am writing ...[0.0003211698785889894, -0.01816680282354355, ...2
1806973This is XXXX XXXX and I am submitting this com...[-0.009485247544944286, -0.025846892967820168,...5
2055053XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX...[-0.010950954630970955, -0.0249345600605011, 0...3
2515231When I reinvestigated my credit report, I real...[-0.009660656563937664, -0.05793113633990288, ...5
2633049Checking my credit report XX/XX/2018 with all ...[-0.0022159104701131582, -0.03330004960298538,...3
\n", + "

5 rows × 3 columns

\n", + "
[5 rows x 3 columns in total]" + ], + "text/plain": [ + " consumer_complaint_narrative \\\n", + "2580664 Hello, my name is XXXX XXXX, and I am writing ... \n", + "1806973 This is XXXX XXXX and I am submitting this com... \n", + "2055053 XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... \n", + "2515231 When I reinvestigated my credit report, I real... \n", + "2633049 Checking my credit report XX/XX/2018 with all ... \n", + "\n", + " text_embedding CENTROID_ID \n", + "2580664 [0.0003211698785889894, -0.01816680282354355, ... 2 \n", + "1806973 [-0.009485247544944286, -0.025846892967820168,... 5 \n", + "2055053 [-0.010950954630970955, -0.0249345600605011, 0... 3 \n", + "2515231 [-0.009660656563937664, -0.05793113633990288, ... 5 \n", + "2633049 [-0.0022159104701131582, -0.03330004960298538,... 3 \n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Join the group number to the complaints and their text embeddings\n", - "combined_clustered_result = combined_df.join(clustered_result)" + "combined_clustered_result = combined_df.join(clustered_result)\n", + "\n", + "combined_clustered_result.head(n=5)" ] }, { @@ -535,7 +1181,7 @@ "id": "21rNsFMHo8hO" }, "source": [ - "## Step 3: Summarize the complaints" + "## Step 3: Use PaLM2 LLM model to summarize complaint clusters" ] }, { @@ -548,11 +1194,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "id": "2E7wXM_jGqo6" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 50c7c0dd-94a2-494e-a37f-6a838a518f6c is DONE. 11.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d96c847f-c292-4804-bd05-fd643c41c7a5 is DONE. 11.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Using bigframes, with syntax identical to pandas,\n", "# filter out the first and second groups\n", @@ -569,11 +1240,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": { "id": "ZNDiueI9IP5e" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "comment list 1:\n", + "1. XXXX is a peer to peer lending conmpany that uses borrowers crypto to collateralize loans from investors ( like myself ). I've been investing with them for almost XXXX years and currently have {$240000.00} tied up in lending products with XXXX. \n", + "As of XXXX days ago we received an email saying all business operations have been ceased and no withdrawals or deposits will be allowed. They said they'll update customers within 10 days, but no one can reach anyone at the company to find out any more details as they are not answering calls nor returning emails. It also appears the company has scrubbed its XXXX page and the XXXX pages of top executives. \n", + "\n", + "All collateral and client 's investment funds are supposedly held at or processed through XXXX XXXX XXXX ( registered SEC company ). XXXX XXXX keeps telling us to contact XXXX and won't give us any information, so we have no way to find out what's happening with our funds/collateral or if everything is gone. We have a XXXX channel up where people are gathering evidence, documentation, etc. This is probably the best place to start to get a broad view of what's happening. Details below. 
\n", + "\n", + "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX CONST LLC ( Business ID : XXXX ) FoXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 'Cease of Operations ' email received by all investors XXXX XX/XX/2022 at XXXX : \" Dear XXXX Users, Given the collapses of several cryptocurrencies so far this year and the rapidly deteriorating market conditions that have been prompting heavy withdrawals across all XXXX lending and XXXX exchange platforms recently, we are sad to inform you that we are unable to continue to operate our business as usual. As such, we are limiting our business activities, including pausing user withdrawals as allowed under our Terms of XXXX. \n", + "No deposit or investment request will be processed at this time. \n", + "\n", + "Our team is working diligently towards our objective of maximizing value for all of our Users, and our top priority continues to be to protect your interests. As we explore all options available to us, we will provide updates to you as we go. \n", + "\n", + "We hope to communicate with you within the next XXXX business days on the next steps to address the situation. We appreciate your patience in this trying time. \n", + "\n", + "Sincerely yoursXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", + "2. Submitted XX/XX/XXXX\n", + "Typed XX/XX/XXXX:\n", + "\n", + "XX/XX/XXXX\n", + "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX XXXX\n", + "PH:. XXXX\n", + "PH: XXXX\n", + "EM:\n", + "XXXX\n", + "XXXX\n", + "XXXX XXXX \n", + "XXXX XXXX\n", + "Date of Birth XX/XX/XXXX\n", + "SS#: XXXX\n", + "TO:\n", + "* Consumer Financial Protection Brueau\n", + "* Department of Veteran Affairs, Office of the Inspector General\n", + "My name is XXXX XXXX XXXX, I've received more than one email from Discover Card in my XXXX XXXX, past emails from Discover Card were unautherized deletions.\n", + "From: Discover Card XXXX\n", + "To: You XXXX\n", + "Date: XX/XX/XXXX, XXXX XXXX XXXX From: Discover Card XXXX>\n", + "To Recipient \n", + "Date Mon, XX/XX/XXXX XXXX XXXX\n", + "I dont and havent ever had a Discover Checking, Savings, Business Accounts nor Loans of any kind through any Bank called Discover. The 1st time I was contacted by Discover Card I resided alone from XX/XX/XXXX to XX/XX/XXXXat XXXX XXXX XXXX at XXXX XXXX XXXX XXXX XXXX in XXXX, XXXX years prior to me moving here to XXXX, XXXX in XX/XX/XXXX. When \n", + "\n", + "\n", + "Discover Card had 1st contacted me in XXXX, XXXX it was associated with my XXXX XXXX XXXX website related online Merchants Account. Not once have I ever applied for or had any Website Merchant Accounts here in XXXX; I only applied for online online Merchant Accounts associated with my XXXX related Accounts I purchased while residing in XXXX, XXXX. Some of my website related information was stolen both in XXXX, XXXX and here in XXXX along with my other property that hasn't been returned to me. 
I don't and haven't ever had any XXXX XXXX related Agreements,Contracts or Credit Cards offered to Veterans associated with ones businesses. Nor have I ever applied for or had a Business License or Business Permit in any City or State inspite of my diverse interest. Not once have I ever allowed another be it an Paralegal, Payee, Attorney, Employers, Landlords, Veteran Organizations including Vocational Rehabilitation Programs, XXXX( XXXX XXXX XXXX, XXXX XXXX, Entertainment Companies, Banks, Celebrity Personal Assistant Agencies or Celebs, Shelters, Charities, HUD, Housing Arthority, Department of Veteran Affairs, Military, Law Enforcement or anyone else nor their employess to sign any business related Agreements or Contracts on my behalf; not even my family members or friends. \n", + "None of my XXXX XXXX attempts were associated with my Employers, Department of Veteran Affairs,Vocational Rehabilitation Programs Military, Landlords, HUD( Housing Authority),Friends, Family nor did I ever sign related Agreements or Contracts with them. Not once had I ever provided anyone the passwords to be able to sign into my accounts rather were aware of my accounts or not. Yes, my desktop computer that was stolen along with my other property XX/XX/XXXX was registered with my Online Merchant Account. I had paid for my Merchant related Accounts through my same XXXX XXXX XXXX Account I purchased both of my XXXX XXXX XXXX related accounts through. That was 1st once during the Summer of XX/XX/XXXX and 2nd my related website months later, while I resided in XXXX XXXX and I worked for XXXX. I never offered nor did I ever sign any business Contracts or Agreements with XXXX nor my Landlord or their staff associted with any of my online websites or Merchant Accounts. My XXXX XXXX XXXX Compensation was deposited into both of my XXXX XXXX XXXX Accounts at that time. My account was changed during the Summer of XX/XX/XXXXbecause of theft of my Bank Card. None of my Checking,Savings, past Credit Cards or Business related were shared accounts in which others were allowed to \n", + "use to make purchases. I had written checks from my XXXX XXXX XXXX account to pay for my XXXX XXXX XXXX XXXX on the XXXX XXXX here in XXXX in XX/XX/XXXX before it's name changed to XXXX XXXX. Prior to me using my same account open a Checking account in person at XXXX XXXX before it's name was changed to XXXX XXXX. Where my XXXX XXXX XXXX XXXX has been deposited since that time. I had used my XXXX XXXX Checking to pay for my XXXX XXXX XXXX XXXX both before theft of my property XX/XX/XXXX and that was also prior to the theft of my property from my XXXX XXXX XXXX XXXX in XX/XX/XXXX.\n", + "I've stated this many times:\n", + "I paid for my 1st XXXX XXXX XXXX Membership while employed at XXXX using my XXXX XXXX XXXX account XXXX my XXXX XXXX XXXX XXXX was also deposited. That was changed to XXXX because I didn't receive my 1st XXXX XXXX XXXX Card the bank sent to XXXX XXXX residence on XXXX XXXX in XX/XX/XXXX while I was there. In which both my XXXX salary and XXXX XXXX XXXX XXXX were deposited into my account, no money from XXXX XXXX nor anyone else that was at that residence was given to nor were any of my children there. Nor did XXXX or any other person at that residence ever give me my missing Bank Card not even after I moved out and stayed a month at XXXX XXXX XXXX using my replacement card to pay for my Hotel room. 
Which is the same account I used to pay for XXXX XXXX Membership, XXXX XXXX XXXX, XXXX XXXX Membership fees, and various online Merchant Account activation related fees.\n", + "* XXXX XXXX XXXX.\n", + "XXXX XXXX XXXX XXXX. Membership\n", + "\n", + "# XXXX\n", + "* XXXX XXXX Membership\n", + "# XXXX\n", + "* Total Merchant Services XXXX and XXXX.\n", + "* XXXX XXXX XXXX XXXX XXXX\n", + "* XXXX XXXX changed my $XXXX a month fees to my XXXX XXXX XXXX account #XXXX.\n", + "XX/XX/XXXX - XX/XX/XXXX XXXX XXXX, XXXX.\n", + "\n", + "Rep: XXXX XXXX XXXX, Fl \n", + "XXXX\n", + "XXXX Website \n", + "XXXX\n", + "Software and website owner, I performed Internet advertising and marketing, to promote this software and website. I worked and XXXX from my home XXXX XXXX XXXX XXXX XXXX , XXXX. I purchased XXXX XXXX XXXX-Software Electronic Book CD and was given a website to promote the software on the internet. The XXXX was given a copy of my website owner certificate document submitted to me when I purchased the software marketing program as well copies of my other school transcripts in addition to XXXX XXXX XXXX for example. XXXX, represented the first initials of my children's names. I wasn't ever paid and I'm still owed the money. Nor did my marketing program have anything to do with any schools, college nor university programs nor did I ever offer or sign any agreement to include it such. Nor did my XXXX XXXX XXXX have anything to do with any other employers, Department of Family and Children, Military, Veteran Organizations or Food Stamp programs, Section 8 nor Indianapolis Housing Authority for example; only me.\n", + "Thank you,\n", + "XXXX XXXX\n", + "3. ACCORDING TO 15 U.S. CODE 6803-DISCLOSURE OF INSTITUTION PRIVACY POLICY, AND ACCORDING TO U.S. CODE 6802- OBLIGATIONS WITH RESPECT TO DISCLOSURES OF PERSONAL INFORMATION. ( b ) OPT OUT ( 1 ) IN GENERAL A FINANCIAL INSTITUTION MAY NOT DISCLOSE NONPUBLIC PERSONAL INFORMATION TO A NONAFFILIATED THIRD PARTY ( TRANSUNION, XXXX, AND XXXX. ) UNLESS- ( A ) SUCH FINANCIAL INSTITUTION CLEARLY AND CONSPICUOUSLY DISCLOSES TO THE CONSUMER, IN WRITING OR IN ELECTRONIC FORM OR OTHER FORM PERMITTED BY THE REGULATIONS PRESCRIBED UNDER SECTION 6804 OF THIS TITLE. ALSO ACCORDING TO THE \" XXXX ACT '', FINANCIAL INSTITUTIONS MUST TELL THEIR CUSTOMERS ABOUT THEIR INFORMATION-SHARING PRACTICES AND EXPLAIN TO CUSTOMERS THEIR RIGHT TO \" OPT OUT '' IF THEY DON'T WANT THEIR INFORMATION SHARED WITH CERTAIN THIRD PARTIES. UNDER THE FDCPA, A COLLECTOR MUST PROVIDE YOU WITH INFORMATION ABOUT THE DEBT IN ITS INITIAL COMMUNICATION OR WITHIN FIVE DAYS AFTER THE INITIAL COMMUNICATION. ALSO, THE FDCPA STATES, \" YOU CAN NOT ATTEMPT TO COLLECT AN DEBT WHILE A PERSON ( THE CONSUMER ) SUPRESS VALIDATION. TRANSUNION, XXXX, XXXX, AND THE ACCOUNTS LISTED BELOW HAVE CLEARLY VIOLATED MY RIGHTS : XXXX ACCOUNT # XXXX, XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX ACCOUNT # XXXX, XXXX XXXX XXXX XXXX ACCOUNT # XXXX, AND XXXX ACCOUNT # XXXX. FAILURE TO RESPOND SATISFACTORILY WITH DELETIONS OF ALL THE ABOVE ACCOUNTS WILL RESULT IN LEGAL ACTIONS BEING TAKEN AGAINST, TRANSUNION, XXXX, XXXX, WHICH I'LL BE SEEKING A {$1000.00} PER VIOLATION FOR DEFAMATION OF CHARACTER ( PER SE ) NEGLIGENT ENABLEMENT OF IDENTITY FRAUD. 15 USC 1681 VIOLATIONS FOR WILLFUL NONCOMPLIANCE-616 CIVIL LIABILITY FOR WILLFUL NONCOPLIANCE. THIS IS THE THIRD TIME I'VE SUBMITTED A COMPLAINT, AND THE REPONSE I GET IS \" YOU CAN NOT LOCATE MY CREDIT REPORT! 
'' THIS IS CLEARLY NEGLIGENCE.\n", + "4. I do not know how this works, but I need it done or somehow corrected. My name is XXXX XXXX, XXXX XXXX XXXX XXXX TN XXXXMy SS XXXX DOB XXXX. I had some issues with my income being affected by the COVID-19PANDEMICSHUTDOWN. I was under the 1 CARESAct, Pub. L. 116-136, section 4021, codified at FCRAsection 623 ( a ) ( 1 ) ( F ) ( i ) ( I ), 15 U.S.C.1681s- 2 ( a ) ( 1 ) ( F ) ( i ) ( I ). I am requesting some accommodations so I care to protect the integrity of my credit file. US DEPT OF ED / XXXX # XXXX, # XXXX accounts are reporting on XXXX, XXXX The was 30,60, 90 DAYS LATEsince requested assistance due to the pandemic. I found a few accounts that I have never done any business with these companies and the accounts do not belong on my report : XXXX XXXX # XXXX, XXXX XXXX XXXX XXXX # XXXX. \n", + "\n", + "I have some issues with the misspelling of my name, my correct spelling is XXXX XXXX. Please remove any other variation of my name they are not correct. The following addresses do not belong to me please delete them : XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXXSC, XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", + "5. I want to know if this is even legal?! How can they disclose information without knowing its a correct email?!\n", + "\n", + "comment list 2:\n", + "1. Hello, my name is XXXX XXXX, and I am writing to delete the following information in my file. The items I need deleted are listed in the report. I am a victim of identity theft and did not make the charge. I ask that the items be deleted to correct my credit report. I reported the theft of my identity to the Federal Trade Commission and I also have enclosed copies of the Federal Trade Commissions Identity Theft Affidavit. Please delete the items as soon as possible. The accounts are being reported currently open and the accounts need to be closed. \n", + "XXXX account number XXXX opened on XX/XX/2022 for the amount {$530.00} XXXX XXXX XXXX account number XXXX opened on XX/XX/2022 for the amount of {$140.00} The accounts are being reported currently open and need to be closed immediately. \n", + "Based on, 15 U.S. Code 1681c2 a consumer reporting agency shall block the reporting of any information in the file of a consumer that the consumer identifies as information that resulted from an alleged identity theft, not later than 4 business days after the date of receipt. This account should not be furnished on my consumer report. As a consumer I am demanding the deletion of the accounts listed IMMEDIATELY.\n", + "2. To whom it may concern : My personal information was breach in the internet as result accounts had been open in my name, I was advise to fill out an Id theft report to help me deal with this situation, I have listed each one of the accounts that do not belong to me. This is my second request to remove unverified items in my report, but XXXX keep rposting these account with out providing any type of original document as the FCRA provide, you need to provide me with original documents or remove these account immediately.\n", + "3. Ive been Disputting my XXXX XXXX I opened this account and someone got my information and used my card, I contacted XXXX over and over, they removed the negative reporting from my XXXX report but still reporting it negative on my XXXX and Expean this is very unfair to me because Im a victim of identity theft\n", + "4. 
Today, XX/XX/2021, I received three items in the mail, one envelope containing an unsolicited debit card from Navy Federal credit Union and the other two, with a letter each describing The Important Rights on two accounts should these accounts become delinquent under New York law. \n", + "\n", + "First of all, I never applied for these accounts with Navy Federal, not have I authorized anyone to do so on my behalf. I immediately contacted Navy Federal via phone and was told I was most likely a victim of identity theft and that I should monitor my credit and use a credit monitoring service. I was also asked for my email and mailing information in order to receive a letter from them regarding this issue. \n", + "\n", + "My main concern is having someone using my identity to illegally open bank accounts and commit fraud, destroying my credit and finances in the process. This bank is in another state from where I reside. I have not lived in Virginia nor do I intend to do so in the foreseeable future.\n", + "5. My personal information ( including my SSN, Drivers License Info, Addresses, and more ) was stolen from a hacking, and Equifax did n't tell the public about the hack until more than a month after the hacking. During this time, three Equifax executives were caught inside trading. It really shows how Equifax cares about other people!\n", + "\n" + ] + } + ], "source": [ "# Build plain-text prompts to send to PaLM 2. Use only 5 complaints from each group.\n", "prompt1 = 'comment list 1:\\n'\n", @@ -592,11 +1352,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "id": "BfHGJLirzSvH" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please highlight the most obvious difference betweenthe two lists of comments:\n", + "comment list 1:\n", + "1. XXXX is a peer to peer lending conmpany that uses borrowers crypto to collateralize loans from investors ( like myself ). I've been investing with them for almost XXXX years and currently have {$240000.00} tied up in lending products with XXXX. \n", + "As of XXXX days ago we received an email saying all business operations have been ceased and no withdrawals or deposits will be allowed. They said they'll update customers within 10 days, but no one can reach anyone at the company to find out any more details as they are not answering calls nor returning emails. It also appears the company has scrubbed its XXXX page and the XXXX pages of top executives. \n", + "\n", + "All collateral and client 's investment funds are supposedly held at or processed through XXXX XXXX XXXX ( registered SEC company ). XXXX XXXX keeps telling us to contact XXXX and won't give us any information, so we have no way to find out what's happening with our funds/collateral or if everything is gone. We have a XXXX channel up where people are gathering evidence, documentation, etc. This is probably the best place to start to get a broad view of what's happening. Details below. 
\n", + "\n", + "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX CONST LLC ( Business ID : XXXX ) FoXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 'Cease of Operations ' email received by all investors XXXX XX/XX/2022 at XXXX : \" Dear XXXX Users, Given the collapses of several cryptocurrencies so far this year and the rapidly deteriorating market conditions that have been prompting heavy withdrawals across all XXXX lending and XXXX exchange platforms recently, we are sad to inform you that we are unable to continue to operate our business as usual. As such, we are limiting our business activities, including pausing user withdrawals as allowed under our Terms of XXXX. \n", + "No deposit or investment request will be processed at this time. \n", + "\n", + "Our team is working diligently towards our objective of maximizing value for all of our Users, and our top priority continues to be to protect your interests. As we explore all options available to us, we will provide updates to you as we go. \n", + "\n", + "We hope to communicate with you within the next XXXX business days on the next steps to address the situation. We appreciate your patience in this trying time. \n", + "\n", + "Sincerely yoursXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", + "2. Submitted XX/XX/XXXX\n", + "Typed XX/XX/XXXX:\n", + "\n", + "XX/XX/XXXX\n", + "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX XXXX\n", + "PH:. XXXX\n", + "PH: XXXX\n", + "EM:\n", + "XXXX\n", + "XXXX\n", + "XXXX XXXX \n", + "XXXX XXXX\n", + "Date of Birth XX/XX/XXXX\n", + "SS#: XXXX\n", + "TO:\n", + "* Consumer Financial Protection Brueau\n", + "* Department of Veteran Affairs, Office of the Inspector General\n", + "My name is XXXX XXXX XXXX, I've received more than one email from Discover Card in my XXXX XXXX, past emails from Discover Card were unautherized deletions.\n", + "From: Discover Card XXXX\n", + "To: You XXXX\n", + "Date: XX/XX/XXXX, XXXX XXXX XXXX From: Discover Card XXXX>\n", + "To Recipient \n", + "Date Mon, XX/XX/XXXX XXXX XXXX\n", + "I dont and havent ever had a Discover Checking, Savings, Business Accounts nor Loans of any kind through any Bank called Discover. The 1st time I was contacted by Discover Card I resided alone from XX/XX/XXXX to XX/XX/XXXXat XXXX XXXX XXXX at XXXX XXXX XXXX XXXX XXXX in XXXX, XXXX years prior to me moving here to XXXX, XXXX in XX/XX/XXXX. When \n", + "\n", + "\n", + "Discover Card had 1st contacted me in XXXX, XXXX it was associated with my XXXX XXXX XXXX website related online Merchants Account. Not once have I ever applied for or had any Website Merchant Accounts here in XXXX; I only applied for online online Merchant Accounts associated with my XXXX related Accounts I purchased while residing in XXXX, XXXX. Some of my website related information was stolen both in XXXX, XXXX and here in XXXX along with my other property that hasn't been returned to me. 
I don't and haven't ever had any XXXX XXXX related Agreements,Contracts or Credit Cards offered to Veterans associated with ones businesses. Nor have I ever applied for or had a Business License or Business Permit in any City or State inspite of my diverse interest. Not once have I ever allowed another be it an Paralegal, Payee, Attorney, Employers, Landlords, Veteran Organizations including Vocational Rehabilitation Programs, XXXX( XXXX XXXX XXXX, XXXX XXXX, Entertainment Companies, Banks, Celebrity Personal Assistant Agencies or Celebs, Shelters, Charities, HUD, Housing Arthority, Department of Veteran Affairs, Military, Law Enforcement or anyone else nor their employess to sign any business related Agreements or Contracts on my behalf; not even my family members or friends. \n", + "None of my XXXX XXXX attempts were associated with my Employers, Department of Veteran Affairs,Vocational Rehabilitation Programs Military, Landlords, HUD( Housing Authority),Friends, Family nor did I ever sign related Agreements or Contracts with them. Not once had I ever provided anyone the passwords to be able to sign into my accounts rather were aware of my accounts or not. Yes, my desktop computer that was stolen along with my other property XX/XX/XXXX was registered with my Online Merchant Account. I had paid for my Merchant related Accounts through my same XXXX XXXX XXXX Account I purchased both of my XXXX XXXX XXXX related accounts through. That was 1st once during the Summer of XX/XX/XXXX and 2nd my related website months later, while I resided in XXXX XXXX and I worked for XXXX. I never offered nor did I ever sign any business Contracts or Agreements with XXXX nor my Landlord or their staff associted with any of my online websites or Merchant Accounts. My XXXX XXXX XXXX Compensation was deposited into both of my XXXX XXXX XXXX Accounts at that time. My account was changed during the Summer of XX/XX/XXXXbecause of theft of my Bank Card. None of my Checking,Savings, past Credit Cards or Business related were shared accounts in which others were allowed to \n", + "use to make purchases. I had written checks from my XXXX XXXX XXXX account to pay for my XXXX XXXX XXXX XXXX on the XXXX XXXX here in XXXX in XX/XX/XXXX before it's name changed to XXXX XXXX. Prior to me using my same account open a Checking account in person at XXXX XXXX before it's name was changed to XXXX XXXX. Where my XXXX XXXX XXXX XXXX has been deposited since that time. I had used my XXXX XXXX Checking to pay for my XXXX XXXX XXXX XXXX both before theft of my property XX/XX/XXXX and that was also prior to the theft of my property from my XXXX XXXX XXXX XXXX in XX/XX/XXXX.\n", + "I've stated this many times:\n", + "I paid for my 1st XXXX XXXX XXXX Membership while employed at XXXX using my XXXX XXXX XXXX account XXXX my XXXX XXXX XXXX XXXX was also deposited. That was changed to XXXX because I didn't receive my 1st XXXX XXXX XXXX Card the bank sent to XXXX XXXX residence on XXXX XXXX in XX/XX/XXXX while I was there. In which both my XXXX salary and XXXX XXXX XXXX XXXX were deposited into my account, no money from XXXX XXXX nor anyone else that was at that residence was given to nor were any of my children there. Nor did XXXX or any other person at that residence ever give me my missing Bank Card not even after I moved out and stayed a month at XXXX XXXX XXXX using my replacement card to pay for my Hotel room. 
Which is the same account I used to pay for XXXX XXXX Membership, XXXX XXXX XXXX, XXXX XXXX Membership fees, and various online Merchant Account activation related fees.\n", + "* XXXX XXXX XXXX.\n", + "XXXX XXXX XXXX XXXX. Membership\n", + "\n", + "# XXXX\n", + "* XXXX XXXX Membership\n", + "# XXXX\n", + "* Total Merchant Services XXXX and XXXX.\n", + "* XXXX XXXX XXXX XXXX XXXX\n", + "* XXXX XXXX changed my $XXXX a month fees to my XXXX XXXX XXXX account #XXXX.\n", + "XX/XX/XXXX - XX/XX/XXXX XXXX XXXX, XXXX.\n", + "\n", + "Rep: XXXX XXXX XXXX, Fl \n", + "XXXX\n", + "XXXX Website \n", + "XXXX\n", + "Software and website owner, I performed Internet advertising and marketing, to promote this software and website. I worked and XXXX from my home XXXX XXXX XXXX XXXX XXXX , XXXX. I purchased XXXX XXXX XXXX-Software Electronic Book CD and was given a website to promote the software on the internet. The XXXX was given a copy of my website owner certificate document submitted to me when I purchased the software marketing program as well copies of my other school transcripts in addition to XXXX XXXX XXXX for example. XXXX, represented the first initials of my children's names. I wasn't ever paid and I'm still owed the money. Nor did my marketing program have anything to do with any schools, college nor university programs nor did I ever offer or sign any agreement to include it such. Nor did my XXXX XXXX XXXX have anything to do with any other employers, Department of Family and Children, Military, Veteran Organizations or Food Stamp programs, Section 8 nor Indianapolis Housing Authority for example; only me.\n", + "Thank you,\n", + "XXXX XXXX\n", + "3. ACCORDING TO 15 U.S. CODE 6803-DISCLOSURE OF INSTITUTION PRIVACY POLICY, AND ACCORDING TO U.S. CODE 6802- OBLIGATIONS WITH RESPECT TO DISCLOSURES OF PERSONAL INFORMATION. ( b ) OPT OUT ( 1 ) IN GENERAL A FINANCIAL INSTITUTION MAY NOT DISCLOSE NONPUBLIC PERSONAL INFORMATION TO A NONAFFILIATED THIRD PARTY ( TRANSUNION, XXXX, AND XXXX. ) UNLESS- ( A ) SUCH FINANCIAL INSTITUTION CLEARLY AND CONSPICUOUSLY DISCLOSES TO THE CONSUMER, IN WRITING OR IN ELECTRONIC FORM OR OTHER FORM PERMITTED BY THE REGULATIONS PRESCRIBED UNDER SECTION 6804 OF THIS TITLE. ALSO ACCORDING TO THE \" XXXX ACT '', FINANCIAL INSTITUTIONS MUST TELL THEIR CUSTOMERS ABOUT THEIR INFORMATION-SHARING PRACTICES AND EXPLAIN TO CUSTOMERS THEIR RIGHT TO \" OPT OUT '' IF THEY DON'T WANT THEIR INFORMATION SHARED WITH CERTAIN THIRD PARTIES. UNDER THE FDCPA, A COLLECTOR MUST PROVIDE YOU WITH INFORMATION ABOUT THE DEBT IN ITS INITIAL COMMUNICATION OR WITHIN FIVE DAYS AFTER THE INITIAL COMMUNICATION. ALSO, THE FDCPA STATES, \" YOU CAN NOT ATTEMPT TO COLLECT AN DEBT WHILE A PERSON ( THE CONSUMER ) SUPRESS VALIDATION. TRANSUNION, XXXX, XXXX, AND THE ACCOUNTS LISTED BELOW HAVE CLEARLY VIOLATED MY RIGHTS : XXXX ACCOUNT # XXXX, XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX ACCOUNT # XXXX, XXXX XXXX XXXX XXXX ACCOUNT # XXXX, AND XXXX ACCOUNT # XXXX. FAILURE TO RESPOND SATISFACTORILY WITH DELETIONS OF ALL THE ABOVE ACCOUNTS WILL RESULT IN LEGAL ACTIONS BEING TAKEN AGAINST, TRANSUNION, XXXX, XXXX, WHICH I'LL BE SEEKING A {$1000.00} PER VIOLATION FOR DEFAMATION OF CHARACTER ( PER SE ) NEGLIGENT ENABLEMENT OF IDENTITY FRAUD. 15 USC 1681 VIOLATIONS FOR WILLFUL NONCOMPLIANCE-616 CIVIL LIABILITY FOR WILLFUL NONCOPLIANCE. THIS IS THE THIRD TIME I'VE SUBMITTED A COMPLAINT, AND THE REPONSE I GET IS \" YOU CAN NOT LOCATE MY CREDIT REPORT! 
'' THIS IS CLEARLY NEGLIGENCE.\n", + "4. I do not know how this works, but I need it done or somehow corrected. My name is XXXX XXXX, XXXX XXXX XXXX XXXX TN XXXXMy SS XXXX DOB XXXX. I had some issues with my income being affected by the COVID-19PANDEMICSHUTDOWN. I was under the 1 CARESAct, Pub. L. 116-136, section 4021, codified at FCRAsection 623 ( a ) ( 1 ) ( F ) ( i ) ( I ), 15 U.S.C.1681s- 2 ( a ) ( 1 ) ( F ) ( i ) ( I ). I am requesting some accommodations so I care to protect the integrity of my credit file. US DEPT OF ED / XXXX # XXXX, # XXXX accounts are reporting on XXXX, XXXX The was 30,60, 90 DAYS LATEsince requested assistance due to the pandemic. I found a few accounts that I have never done any business with these companies and the accounts do not belong on my report : XXXX XXXX # XXXX, XXXX XXXX XXXX XXXX # XXXX. \n", + "\n", + "I have some issues with the misspelling of my name, my correct spelling is XXXX XXXX. Please remove any other variation of my name they are not correct. The following addresses do not belong to me please delete them : XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXXSC, XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", + "5. I want to know if this is even legal?! How can they disclose information without knowing its a correct email?!\n", + "comment list 2:\n", + "1. Hello, my name is XXXX XXXX, and I am writing to delete the following information in my file. The items I need deleted are listed in the report. I am a victim of identity theft and did not make the charge. I ask that the items be deleted to correct my credit report. I reported the theft of my identity to the Federal Trade Commission and I also have enclosed copies of the Federal Trade Commissions Identity Theft Affidavit. Please delete the items as soon as possible. The accounts are being reported currently open and the accounts need to be closed. \n", + "XXXX account number XXXX opened on XX/XX/2022 for the amount {$530.00} XXXX XXXX XXXX account number XXXX opened on XX/XX/2022 for the amount of {$140.00} The accounts are being reported currently open and need to be closed immediately. \n", + "Based on, 15 U.S. Code 1681c2 a consumer reporting agency shall block the reporting of any information in the file of a consumer that the consumer identifies as information that resulted from an alleged identity theft, not later than 4 business days after the date of receipt. This account should not be furnished on my consumer report. As a consumer I am demanding the deletion of the accounts listed IMMEDIATELY.\n", + "2. To whom it may concern : My personal information was breach in the internet as result accounts had been open in my name, I was advise to fill out an Id theft report to help me deal with this situation, I have listed each one of the accounts that do not belong to me. This is my second request to remove unverified items in my report, but XXXX keep rposting these account with out providing any type of original document as the FCRA provide, you need to provide me with original documents or remove these account immediately.\n", + "3. Ive been Disputting my XXXX XXXX I opened this account and someone got my information and used my card, I contacted XXXX over and over, they removed the negative reporting from my XXXX report but still reporting it negative on my XXXX and Expean this is very unfair to me because Im a victim of identity theft\n", + "4. 
Today, XX/XX/2021, I received three items in the mail, one envelope containing an unsolicited debit card from Navy Federal credit Union and the other two, with a letter each describing The Important Rights on two accounts should these accounts become delinquent under New York law. \n",
+ "\n",
+ "First of all, I never applied for these accounts with Navy Federal, not have I authorized anyone to do so on my behalf. I immediately contacted Navy Federal via phone and was told I was most likely a victim of identity theft and that I should monitor my credit and use a credit monitoring service. I was also asked for my email and mailing information in order to receive a letter from them regarding this issue. \n",
+ "\n",
+ "My main concern is having someone using my identity to illegally open bank accounts and commit fraud, destroying my credit and finances in the process. This bank is in another state from where I reside. I have not lived in Virginia nor do I intend to do so in the foreseeable future.\n",
+ "5. My personal information ( including my SSN, Drivers License Info, Addresses, and more ) was stolen from a hacking, and Equifax did n't tell the public about the hack until more than a month after the hacking. During this time, three Equifax executives were caught inside trading. It really shows how Equifax cares about other people!\n",
+ "\n"
+ ]
+ }
+ ],
 "source": [
 "# The plain English request we will make of PaLM 2\n",
 "prompt = (\n",
@@ -616,22 +1465,42 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
 "metadata": {
 "id": "mL5P0_3X04dE"
 },
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 66e3af22-91cb-400a-92c3-69e7cd12ee01 is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
 "source": [
 "from bigframes.ml.llm import PaLM2TextGenerator\n",
 "\n",
- "session = bf.get_global_session()\n",
- "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n",
- "q_a_model = PaLM2TextGenerator(session=session, connection_name=connection)"
+ "q_a_model = PaLM2TextGenerator()"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 39,
 "metadata": {
 "id": "ICWHsqAW1FNk"
 },
@@ -643,11 +1512,58 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
 "metadata": {
 "id": "gB7e1LXU1pst"
 },
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 653add17-29be-408c-8882-064217f8556e is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Query job 8fd16954-853a-45fd-80bc-65b1242429e2 is DONE. 8 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Query job d9929bcb-26ce-4844-b68e-f4a980b90ede is DONE. 171 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "' The first comment list is about people complaining about companies or services, while the second comment list is about people reporting identity theft or fraud.'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Send the request for PaLM 2 to generate a response to our prompt\n", "major_difference = q_a_model.predict(df)\n", @@ -662,6 +1578,21 @@ "source": [ "We now see PaLM2TextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary and next steps\n", + "\n", +<<<<<<< HEAD + "You've used the ML and LLM capabilities of BigQuery DataFrames to help analyze and understand a large dataset of unstructured feedback.\n", +======= + "You've used BigQuery DataFrames' integration with LLM models (`bigframes.ml.llm`) to generate code samples, and have tranformed LLM output by creating and using a custom function in BigQuery DataFrames.\n", +>>>>>>> origin/lmm-kmeans-notebook + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." + ] } ], "metadata": { @@ -682,7 +1613,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.13" } }, "nbformat": 4, From 416171a70d91d4a6b71622ba72685147ab7d6186 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 16 Nov 2023 11:04:18 -0800 Subject: [PATCH 02/26] feat!: model.predict returns all the columns (#204) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/cluster.py | 4 +- bigframes/ml/decomposition.py | 9 +- bigframes/ml/ensemble.py | 49 +- bigframes/ml/forecasting.py | 9 +- bigframes/ml/imported.py | 22 +- bigframes/ml/linear_model.py | 24 +- bigframes/ml/llm.py | 17 +- .../getting_started/ml_fundamentals.ipynb | 3586 ++++++++++------- .../sklearn_linear_regression.ipynb | 1192 +++--- tests/system/large/ml/test_cluster.py | 4 +- tests/system/large/ml/test_ensemble.py | 2 +- tests/system/large/ml/test_pipeline.py | 4 +- tests/system/small/ml/test_cluster.py | 4 +- tests/system/small/ml/test_ensemble.py | 20 +- tests/system/small/ml/test_forecasting.py | 4 +- tests/system/small/ml/test_imported.py | 8 +- tests/system/small/ml/test_linear_model.py | 8 +- tests/system/small/ml/test_llm.py | 22 +- .../sklearn/cluster/_kmeans.py | 16 +- .../sklearn/linear_model/_base.py | 6 +- .../bigframes_vendored/xgboost/sklearn.py | 2 +- 21 files changed, 2737 insertions(+), 2275 deletions(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 772b90f666..c9f52ba0b6 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import cast, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union from google.cloud import bigquery @@ -92,7 +92,7 @@ def predict( (X,) = utils.convert_to_dataframe(X) - return cast(bpd.DataFrame, self._bqml_model.predict(X)[["CENTROID_ID"]]) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: """Save the model to BigQuery. diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 8e6be6d28c..7cda7a6993 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import cast, List, Optional, Union +from typing import List, Optional, Union from google.cloud import bigquery @@ -106,12 +106,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - return cast( - bpd.DataFrame, - self._bqml_model.predict(X)[ - ["principal_component_" + str(i + 1) for i in range(self.n_components)] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> PCA: """Save the model to BigQuery. 
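To make the breaking change concrete before the remaining per-estimator hunks: after this patch, every `predict` in `bigframes.ml` returns the input columns together with the prediction columns, rather than only the prediction or label columns. The sketch below is illustrative and not part of the diff; it assumes a configured BigQuery DataFrames session, borrows the public penguins table and feature columns used by the notebooks in this series, and relies on `CENTROID_ID` being the prediction column that the old `cluster.py` code selected above.

```python
import bigframes.pandas as bpd
from bigframes.ml.cluster import KMeans

# Load the public penguins table and keep a few numeric feature columns.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()
features = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]

model = KMeans(n_clusters=3)
model.fit(features)

predictions = model.predict(features)
# Before this patch: predictions held only the "CENTROID_ID" column.
# After this patch: predictions also carries the input feature columns, so
# callers that depended on the old narrow shape should select it explicitly:
centroid_ids = predictions[["CENTROID_ID"]]
```

The same explicit column selection migrates callers of the ensemble, forecasting, imported, linear, and LLM predictors changed in the hunks below (for example, keeping only the label columns, or only the generated-text column for `PaLM2TextGenerator`).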
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 19ca8608ff..fcb3fe5343 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import cast, Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Union from google.cloud import bigquery @@ -168,16 +168,7 @@ def predict( raise RuntimeError("A model must be fitted before predict") (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -328,19 +319,9 @@ def _fit( def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -486,19 +467,9 @@ def predict( ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -661,19 +632,9 @@ def predict( ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8e309d5e73..cf23854fa0 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union from google.cloud import bigquery @@ -24,8 +24,6 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -_PREDICT_OUTPUT_COLUMNS = ["forecast_timestamp", "forecast_value"] - class ARIMAPlus(base.SupervisedTrainablePredictor): """Time Series ARIMA Plus model.""" @@ -100,10 +98,7 @@ def predict(self, X=None) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - return cast( - bpd.DataFrame, - self._bqml_model.forecast()[_PREDICT_OUTPUT_COLUMNS], - ) + return self._bqml_model.forecast() def score( self, diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index fb8aa98bef..f6afc9aa38 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -78,16 +78,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: """Save the model to BigQuery. 
@@ -161,16 +152,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: """Save the model to BigQuery. diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index f11879500b..433d9fbc38 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import cast, Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Union from google.cloud import bigquery @@ -145,16 +145,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -267,16 +258,7 @@ def predict( (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 3cfc28e61f..93e2ba825f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -149,7 +149,8 @@ def predict( Returns: - bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output text results.""" + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models if temperature < 0.0 or temperature > 1.0: @@ -181,11 +182,7 @@ def predict( "top_p": top_p, "flatten_json_output": True, } - df = self._bqml_model.generate_text(X, options) - return cast( - bpd.DataFrame, - df[[_TEXT_GENERATE_RESULT_COLUMN]], - ) + return self._bqml_model.generate_text(X, options) class PaLM2TextEmbeddingGenerator(base.Predictor): @@ -269,7 +266,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples. Returns: - bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output embedding results + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. 
""" # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models @@ -287,8 +284,4 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: options = { "flatten_json_output": True, } - df = self._bqml_model.generate_text_embedding(X, options) - return cast( - bpd.DataFrame, - df[[_EMBED_TEXT_RESULT_COLUMN]], - ) + return self._bqml_model.generate_text_embedding(X, options) diff --git a/notebooks/getting_started/ml_fundamentals.ipynb b/notebooks/getting_started/ml_fundamentals.ipynb index 2f566dd704..165bd90f31 100644 --- a/notebooks/getting_started/ml_fundamentals.ipynb +++ b/notebooks/getting_started/ml_fundamentals.ipynb @@ -14,46 +14,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0c8a8bc0b4d64448aef68d6a98fae666", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 28e903c6-e874-4b99-8f53-0755e0b0c188 is RUNNING. " ] }, "metadata": {}, @@ -61,13 +31,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9680fd748e0546b4a010fda0155c5027", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e8aba858-7660-4274-8d90-8d2b0382f8f6 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 7950d6a7-3747-4454-bba2-9660e830647f is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -117,250 +85,250 @@ " \n", " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", + " MALE\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 2\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 47.4\n", + " 14.6\n", + " 212.0\n", + " 4725.0\n", + " FEMALE\n", " \n", " \n", " 3\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.5\n", - " 17.9\n", - " 192.0\n", - " 3500.0\n", + " 42.5\n", + " 16.7\n", + " 187.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", + " MALE\n", " \n", " \n", " 5\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.7\n", + " 15.3\n", + " 219.0\n", + " 5200.0\n", " MALE\n", " \n", " \n", " 6\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 46.9\n", - " 16.6\n", - " 192.0\n", - " 2700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 7\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.5\n", - " 18.4\n", - " 200.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.2\n", + " 13.8\n", + " 215.0\n", + " 4750.0\n", " FEMALE\n", " \n", " \n", " 8\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " 
Dream\n", - " 49.5\n", - " 19.0\n", - " 200.0\n", - " 3800.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " 4550.0\n", + " FEMALE\n", " \n", " \n", " 9\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.5\n", + " 15.2\n", + " 216.0\n", + " 5000.0\n", + " FEMALE\n", " \n", " \n", " 10\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.2\n", + " 15.6\n", + " 221.0\n", + " 5100.0\n", " MALE\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", + " FEMALE\n", " \n", " \n", " 12\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.7\n", + " 15.0\n", + " 223.0\n", + " 5550.0\n", + " MALE\n", " \n", " \n", " 13\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185.0\n", - " 3700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", " 15\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " 5350.0\n", + " MALE\n", " \n", " \n", " 16\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 45.7\n", - " 17.3\n", - " 193.0\n", - " 3600.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 17\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.6\n", - " 19.4\n", - " 193.0\n", - " 3800.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.8\n", + " 15.4\n", + " 215.0\n", + " 5150.0\n", " MALE\n", " \n", " \n", " 18\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 50.3\n", + " 20.0\n", + " 197.0\n", + " 3300.0\n", " MALE\n", " \n", " \n", " 19\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", " 20\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.6\n", - " 17.8\n", - " 193.0\n", - " 3800.0\n", - " FEMALE\n", + " 51.0\n", + " 18.8\n", + " 203.0\n", + " 4100.0\n", + " MALE\n", " \n", " \n", " 21\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 51.3\n", - " 19.2\n", - " 193.0\n", - " 3650.0\n", - " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", + " FEMALE\n", " \n", " \n", " 22\n", - " Adelie Penguin (Pygoscelis 
adeliae)\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.5\n", + " 13.9\n", + " 210.0\n", + " 4200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 49.6\n", - " 18.2\n", - " 193.0\n", + " 51.7\n", + " 20.3\n", + " 194.0\n", " 3775.0\n", " MALE\n", " \n", @@ -370,86 +338,86 @@ "[334 rows x 7 columns in total]" ], "text/plain": [ - " species island \\\n", - "penguin_id \n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + " species island \\\n", + "penguin_id \n", + "0 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "1 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", + "2 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "4 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "5 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "6 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "7 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "8 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "10 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "14 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "15 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "16 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "18 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "21 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "24 Chinstrap penguin (Pygoscelis antarctica) Dream \n", "\n", " culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", 
"penguin_id \n", - "0 36.6 18.4 184.0 3475.0 \n", - "1 39.8 19.1 184.0 4650.0 \n", - "2 40.9 18.9 184.0 3900.0 \n", - "3 46.5 17.9 192.0 3500.0 \n", - "4 37.3 16.8 192.0 3000.0 \n", - "5 43.2 18.5 192.0 4100.0 \n", - "6 46.9 16.6 192.0 2700.0 \n", - "7 50.5 18.4 200.0 3400.0 \n", - "8 49.5 19.0 200.0 3800.0 \n", - "9 40.2 20.1 200.0 3975.0 \n", - "10 40.8 18.9 208.0 4300.0 \n", - "11 39.0 18.7 185.0 3650.0 \n", - "12 37.0 16.9 185.0 3000.0 \n", - "13 47.0 17.3 185.0 3700.0 \n", - "14 34.0 17.1 185.0 3400.0 \n", - "15 37.0 16.5 185.0 3400.0 \n", - "16 45.7 17.3 193.0 3600.0 \n", - "17 50.6 19.4 193.0 3800.0 \n", - "18 39.7 17.9 193.0 4250.0 \n", - "19 37.8 18.1 193.0 3750.0 \n", - "20 46.6 17.8 193.0 3800.0 \n", - "21 51.3 19.2 193.0 3650.0 \n", - "22 40.2 17.1 193.0 3400.0 \n", - "23 36.8 18.5 193.0 3500.0 \n", - "24 49.6 18.2 193.0 3775.0 \n", + "0 40.1 18.9 188.0 4300.0 \n", + "1 39.1 18.7 181.0 3750.0 \n", + "2 47.4 14.6 212.0 4725.0 \n", + "3 42.5 16.7 187.0 3350.0 \n", + "4 43.2 19.0 197.0 4775.0 \n", + "5 46.7 15.3 219.0 5200.0 \n", + "6 41.3 21.1 195.0 4400.0 \n", + "7 45.2 13.8 215.0 4750.0 \n", + "8 46.5 13.5 210.0 4550.0 \n", + "9 50.5 15.2 216.0 5000.0 \n", + "10 48.2 15.6 221.0 5100.0 \n", + "11 38.1 18.6 190.0 3700.0 \n", + "12 50.7 15.0 223.0 5550.0 \n", + "13 37.8 20.0 190.0 4250.0 \n", + "14 35.0 17.9 190.0 3450.0 \n", + "15 48.7 15.7 208.0 5350.0 \n", + "16 34.6 21.1 198.0 4400.0 \n", + "17 46.8 15.4 215.0 5150.0 \n", + "18 50.3 20.0 197.0 3300.0 \n", + "19 37.2 18.1 178.0 3900.0 \n", + "20 51.0 18.8 203.0 4100.0 \n", + "21 40.5 17.9 187.0 3200.0 \n", + "22 45.5 13.9 210.0 4200.0 \n", + "23 42.2 18.5 180.0 3550.0 \n", + "24 51.7 20.3 194.0 3775.0 \n", "\n", " sex \n", "penguin_id \n", - "0 FEMALE \n", + "0 MALE \n", "1 MALE \n", - "2 MALE \n", + "2 FEMALE \n", "3 FEMALE \n", - "4 FEMALE \n", + "4 MALE \n", "5 MALE \n", - "6 FEMALE \n", + "6 MALE \n", "7 FEMALE \n", - "8 MALE \n", - "9 MALE \n", + "8 FEMALE \n", + "9 FEMALE \n", "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "13 FEMALE \n", + "11 FEMALE \n", + "12 MALE \n", + "13 MALE \n", "14 FEMALE \n", - "15 FEMALE \n", - "16 FEMALE \n", + "15 MALE \n", + "16 MALE \n", "17 MALE \n", "18 MALE \n", "19 MALE \n", - "20 FEMALE \n", - "21 MALE \n", + "20 MALE \n", + "21 FEMALE \n", "22 FEMALE \n", "23 FEMALE \n", "24 MALE \n", @@ -458,7 +426,7 @@ "[334 rows x 7 columns]" ] }, - "execution_count": 18, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -490,18 +458,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "171160f246eb43d1832aeefb055c0851", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job deda90a8-6ec7-419c-8067-e85777bd916f is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1408053d-cb80-4870-af28-e94b90a20a6d is DONE. 28.9 kB processed. " ] }, "metadata": {}, @@ -509,13 +475,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "eaffac40f94745728e6bd618bebd2c53", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job efe8fa0a-d450-475a-99d5-36beeb985247 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 262885fe-973c-4338-a853-227f9db4835a is DONE. 31.7 kB processed. 
" ] }, "metadata": {}, @@ -523,13 +487,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "68e7ecdc639f4d3ab482830bf6a9da04", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5022c56d-e605-4cab-be1b-1ecf189588a1 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fb1dc831-7f6f-42ce-96da-1292d73919b4 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -537,13 +499,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ebfe197fd88348129ebe2f7d288bf4b9", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 175bd293-d448-4510-b926-1d8cfb4eb5e7 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e79add79-f1e4-4cf0-bb97-04d153222f19 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -551,13 +511,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2ae69ea7da5247e8a1f7cd0e049629cb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job a3a2e68c-f5f3-4237-99ad-44974f29d090 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job cb5ee343-f86e-4795-b0ce-d58854e72e5c is RUNNING. " ] }, "metadata": {}, @@ -596,18 +554,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5ed4206cd3ad4cd485315605bf033df2", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job db3365fb-67ca-44cc-a117-88a80dc63cca is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e65af31c-feda-468d-89c9-dec033574640 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -615,13 +571,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ac72db21945542558fdd62093d9dc0c3", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ab78f7ab-a115-448b-92d0-19c091a831ca is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 0455f252-2b94-457e-bad5-672b91d9b51f is RUNNING. 
" ] }, "metadata": {}, @@ -667,47 +621,47 @@ " \n", " \n", " \n", - " 156\n", - " Biscoe\n", - " 46.2\n", - " 14.5\n", - " 209.0\n", - " FEMALE\n", - " Gentoo penguin (Pygoscelis papua)\n", + " 249\n", + " Torgersen\n", + " 41.1\n", + " 18.6\n", + " 189.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 189\n", + " 36\n", " Biscoe\n", - " 35.3\n", - " 18.9\n", - " 187.0\n", + " 43.4\n", + " 14.4\n", + " 218.0\n", " FEMALE\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 279\n", + " 74\n", " Biscoe\n", - " 45.1\n", - " 14.5\n", - " 215.0\n", + " 42.8\n", + " 14.2\n", + " 209.0\n", " FEMALE\n", " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 245\n", - " Biscoe\n", - " 49.5\n", - " 16.2\n", - " 229.0\n", - " MALE\n", - " Gentoo penguin (Pygoscelis papua)\n", + " 235\n", + " Dream\n", + " 34.0\n", + " 17.1\n", + " 185.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 343\n", - " Torgersen\n", - " 37.3\n", - " 20.5\n", - " 199.0\n", + " 117\n", + " Dream\n", + " 37.8\n", + " 18.1\n", + " 193.0\n", " MALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", @@ -719,24 +673,24 @@ "text/plain": [ " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", "penguin_id \n", - "156 Biscoe 46.2 14.5 209.0 \n", - "189 Biscoe 35.3 18.9 187.0 \n", - "279 Biscoe 45.1 14.5 215.0 \n", - "245 Biscoe 49.5 16.2 229.0 \n", - "343 Torgersen 37.3 20.5 199.0 \n", + "249 Torgersen 41.1 18.6 189.0 \n", + "36 Biscoe 43.4 14.4 218.0 \n", + "74 Biscoe 42.8 14.2 209.0 \n", + "235 Dream 34.0 17.1 185.0 \n", + "117 Dream 37.8 18.1 193.0 \n", "\n", " sex species \n", "penguin_id \n", - "156 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "189 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "279 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "245 MALE Gentoo penguin (Pygoscelis papua) \n", - "343 MALE Adelie Penguin (Pygoscelis adeliae) \n", + "249 MALE Adelie Penguin (Pygoscelis adeliae) \n", + "36 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "74 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "235 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "117 MALE Adelie Penguin (Pygoscelis adeliae) \n", "\n", "[5 rows x 6 columns]" ] }, - "execution_count": 20, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -749,18 +703,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d6dd794f89724099950dcc927d63d0f5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 22a72cad-11a6-4f8e-b16d-f92853b8112e is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d5a173bd-a7dc-42fa-8468-b088d47ccfe0 is RUNNING. " ] }, "metadata": {}, @@ -768,13 +720,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a8ab7ca12e0d43a6803483480e837c6e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job bc952727-8806-4fe2-abf2-c3a8a2bd9b6d is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c6b6518b-2689-4dc1-a5b0-2a9ab75301eb is RUNNING. 
" ] }, "metadata": {}, @@ -810,24 +760,24 @@ " \n", " \n", " \n", - " 156\n", - " 4800.0\n", + " 249\n", + " 3325.0\n", " \n", " \n", - " 189\n", - " 3800.0\n", + " 36\n", + " 4600.0\n", " \n", " \n", - " 279\n", - " 5000.0\n", + " 74\n", + " 4700.0\n", " \n", " \n", - " 245\n", - " 5800.0\n", + " 235\n", + " 3400.0\n", " \n", " \n", - " 343\n", - " 3775.0\n", + " 117\n", + " 3750.0\n", " \n", " \n", "\n", @@ -837,16 +787,16 @@ "text/plain": [ " body_mass_g\n", "penguin_id \n", - "156 4800.0\n", - "189 3800.0\n", - "279 5000.0\n", - "245 5800.0\n", - "343 3775.0\n", + "249 3325.0\n", + "36 4600.0\n", + "74 4700.0\n", + "235 3400.0\n", + "117 3750.0\n", "\n", "[5 rows x 1 columns]" ] }, - "execution_count": 21, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -880,18 +830,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "380c57dc3fe54fbd8ad2fb23f1e66e37", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job f239341e-785f-43e1-bfe0-683132d6f15f is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 03a0eb1c-747e-4c2a-b7b5-d3e4e5a78134 is RUNNING. " ] }, "metadata": {}, @@ -899,13 +847,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3db47aadba854beca71960d846838dc4", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2d5bbbb9-efc4-4f4e-a8dc-2c7b66b0e5e0 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 70608c84-dac8-4e77-8a9e-00d823b24f37 is RUNNING. " ] }, "metadata": {}, @@ -913,13 +859,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1de81f2944a44cbda3f16fa8a1fae813", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 66120e1c-2471-4a0c-8b82-aeb189c8866a is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d18fdc32-2152-45d3-8c62-bf9b1556ec47 is RUNNING. " ] }, "metadata": {}, @@ -927,13 +871,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b06cae61a4534388a4e9ed26ce442cc2", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 62825fc4-5b77-43e5-a3e4-525ebfd1285b is DONE. 2.1 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 2a022682-535f-4dc0-80ba-1640306ad9ef is RUNNING. " ] }, "metadata": {}, @@ -941,13 +883,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "977c8eae2c9848e98c5478c41af82633", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 656d1d69-b4ff-4db6-9f2d-28dcf91e2fd7 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c145b39d-7d02-4394-80f0-fc605b2ba256 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -955,13 +895,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aefc3085fee04c438d0327d400b4b72a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 466507c8-1474-4725-93e5-baf8ee292e39 is DONE. 8.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fc156a2b-db95-44a3-9ad1-d95b9d290080 is RUNNING. 
" ] }, "metadata": {}, @@ -1002,153 +940,153 @@ " \n", " \n", " 0\n", - " -1.344188\n", - " 0.642519\n", - " -1.193942\n", - " \n", - " \n", - " 1\n", - " -0.750047\n", - " 1.005876\n", - " -1.193942\n", + " -0.750505\n", + " 0.84903\n", + " -0.937262\n", " \n", " \n", " 2\n", - " -0.545811\n", - " 0.90206\n", - " -1.193942\n", + " 0.622496\n", + " -1.322402\n", + " 0.804051\n", " \n", " \n", - " 4\n", - " -1.214219\n", - " -0.188011\n", - " -0.619171\n", + " 3\n", + " -0.299107\n", + " -0.261935\n", + " -1.009817\n", " \n", " \n", " 5\n", - " -0.118772\n", - " 0.694427\n", - " -0.619171\n", + " 0.490839\n", + " -0.968913\n", + " 1.311935\n", " \n", " \n", " 6\n", - " 0.568203\n", - " -0.291828\n", - " -0.619171\n", + " -0.524806\n", + " 1.959995\n", + " -0.429379\n", " \n", " \n", " 7\n", - " 1.236611\n", - " 0.642519\n", - " -0.044401\n", + " 0.208715\n", + " -1.726389\n", + " 1.021716\n", " \n", " \n", " 9\n", - " -0.675779\n", - " 1.524957\n", - " -0.044401\n", + " 1.205551\n", + " -1.019412\n", + " 1.09427\n", " \n", " \n", " 10\n", - " -0.564378\n", - " 0.90206\n", - " 0.530369\n", - " \n", - " \n", - " 11\n", - " -0.898582\n", - " 0.798243\n", - " -1.122096\n", + " 0.772962\n", + " -0.817418\n", + " 1.457044\n", " \n", " \n", " 12\n", - " -1.26992\n", - " -0.136103\n", - " -1.122096\n", - " \n", - " \n", - " 13\n", - " 0.58677\n", - " 0.071529\n", - " -1.122096\n", + " 1.243168\n", + " -1.120408\n", + " 1.602153\n", " \n", " \n", " 14\n", - " -1.826927\n", - " -0.032287\n", - " -1.122096\n", + " -1.709725\n", + " 0.344046\n", + " -0.792152\n", " \n", " \n", - " 15\n", - " -1.26992\n", - " -0.343736\n", - " -1.122096\n", - " \n", - " \n", - " 16\n", - " 0.3454\n", - " 0.071529\n", - " -0.547325\n", + " 17\n", + " 0.509647\n", + " -0.918415\n", + " 1.021716\n", " \n", " \n", " 18\n", - " -0.768614\n", - " 0.382978\n", - " -0.547325\n", + " 1.167935\n", + " 1.404513\n", + " -0.284269\n", " \n", " \n", " 19\n", - " -1.121385\n", - " 0.486795\n", - " -0.547325\n", + " -1.295944\n", + " 0.445043\n", + " -1.662809\n", " \n", " \n", " 20\n", - " 0.512502\n", - " 0.33107\n", - " -0.547325\n", + " 1.299593\n", + " 0.798532\n", + " 0.151059\n", " \n", " \n", " 21\n", - " 1.385146\n", - " 1.057784\n", - " -0.547325\n", + " -0.675272\n", + " 0.344046\n", + " -1.009817\n", " \n", " \n", " 22\n", - " -0.675779\n", - " -0.032287\n", - " -0.547325\n", + " 0.26514\n", + " -1.675891\n", + " 0.658942\n", " \n", " \n", " 24\n", - " 1.069509\n", - " 0.538703\n", - " -0.547325\n", + " 1.43125\n", + " 1.556008\n", + " -0.501934\n", + " \n", + " \n", + " 25\n", + " 0.302756\n", + " 0.041055\n", + " -0.574488\n", " \n", " \n", " 26\n", - " -0.43441\n", - " 0.694427\n", - " 0.027445\n", + " 0.302756\n", + " -1.675891\n", + " 0.949161\n", + " \n", + " \n", + " 27\n", + " 0.227523\n", + " -1.776888\n", + " 0.658942\n", " \n", " \n", " 28\n", - " 1.923586\n", - " 1.888314\n", - " 0.027445\n", + " 1.318401\n", + " -0.362932\n", + " 1.747263\n", + " \n", + " \n", + " 29\n", + " 2.202388\n", + " 1.303516\n", + " 0.441278\n", " \n", " \n", " 30\n", - " 1.292312\n", - " 0.694427\n", - " 0.027445\n", + " -0.919779\n", + " 1.959995\n", + " -0.356824\n", " \n", " \n", " 31\n", - " -1.994029\n", - " -0.551368\n", - " -1.62502\n", + " 1.036277\n", + " -0.615424\n", + " 1.747263\n", + " \n", + " \n", + " 32\n", + " -0.223874\n", + " 0.19255\n", + " -0.356824\n", " \n", " \n", "\n", @@ -1158,65 +1096,65 @@ "text/plain": [ " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "0 
-1.344188 0.642519 \n", - "1 -0.750047 1.005876 \n", - "2 -0.545811 0.90206 \n", - "4 -1.214219 -0.188011 \n", - "5 -0.118772 0.694427 \n", - "6 0.568203 -0.291828 \n", - "7 1.236611 0.642519 \n", - "9 -0.675779 1.524957 \n", - "10 -0.564378 0.90206 \n", - "11 -0.898582 0.798243 \n", - "12 -1.26992 -0.136103 \n", - "13 0.58677 0.071529 \n", - "14 -1.826927 -0.032287 \n", - "15 -1.26992 -0.343736 \n", - "16 0.3454 0.071529 \n", - "18 -0.768614 0.382978 \n", - "19 -1.121385 0.486795 \n", - "20 0.512502 0.33107 \n", - "21 1.385146 1.057784 \n", - "22 -0.675779 -0.032287 \n", - "24 1.069509 0.538703 \n", - "26 -0.43441 0.694427 \n", - "28 1.923586 1.888314 \n", - "30 1.292312 0.694427 \n", - "31 -1.994029 -0.551368 \n", + "0 -0.750505 0.84903 \n", + "2 0.622496 -1.322402 \n", + "3 -0.299107 -0.261935 \n", + "5 0.490839 -0.968913 \n", + "6 -0.524806 1.959995 \n", + "7 0.208715 -1.726389 \n", + "9 1.205551 -1.019412 \n", + "10 0.772962 -0.817418 \n", + "12 1.243168 -1.120408 \n", + "14 -1.709725 0.344046 \n", + "17 0.509647 -0.918415 \n", + "18 1.167935 1.404513 \n", + "19 -1.295944 0.445043 \n", + "20 1.299593 0.798532 \n", + "21 -0.675272 0.344046 \n", + "22 0.26514 -1.675891 \n", + "24 1.43125 1.556008 \n", + "25 0.302756 0.041055 \n", + "26 0.302756 -1.675891 \n", + "27 0.227523 -1.776888 \n", + "28 1.318401 -0.362932 \n", + "29 2.202388 1.303516 \n", + "30 -0.919779 1.959995 \n", + "31 1.036277 -0.615424 \n", + "32 -0.223874 0.19255 \n", "\n", " standard_scaled_flipper_length_mm \n", "penguin_id \n", - "0 -1.193942 \n", - "1 -1.193942 \n", - "2 -1.193942 \n", - "4 -0.619171 \n", - "5 -0.619171 \n", - "6 -0.619171 \n", - "7 -0.044401 \n", - "9 -0.044401 \n", - "10 0.530369 \n", - "11 -1.122096 \n", - "12 -1.122096 \n", - "13 -1.122096 \n", - "14 -1.122096 \n", - "15 -1.122096 \n", - "16 -0.547325 \n", - "18 -0.547325 \n", - "19 -0.547325 \n", - "20 -0.547325 \n", - "21 -0.547325 \n", - "22 -0.547325 \n", - "24 -0.547325 \n", - "26 0.027445 \n", - "28 0.027445 \n", - "30 0.027445 \n", - "31 -1.62502 \n", + "0 -0.937262 \n", + "2 0.804051 \n", + "3 -1.009817 \n", + "5 1.311935 \n", + "6 -0.429379 \n", + "7 1.021716 \n", + "9 1.09427 \n", + "10 1.457044 \n", + "12 1.602153 \n", + "14 -0.792152 \n", + "17 1.021716 \n", + "18 -0.284269 \n", + "19 -1.662809 \n", + "20 0.151059 \n", + "21 -1.009817 \n", + "22 0.658942 \n", + "24 -0.501934 \n", + "25 -0.574488 \n", + "26 0.949161 \n", + "27 0.658942 \n", + "28 1.747263 \n", + "29 0.441278 \n", + "30 -0.356824 \n", + "31 1.747263 \n", + "32 -0.356824 \n", "...\n", "\n", "[267 rows x 3 columns]" ] }, - "execution_count": 22, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1237,32 +1175,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "74f3c24c0a434e12bf6a56dc4809b501", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job c6268b07-0d3d-4fe0-971d-cc99fd98cd7e is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 31550d88-fc7b-4fcb-9975-9ed24bf2e009 is RUNNING. " ] }, "metadata": {}, @@ -1270,13 +1192,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5a04e46a7d0248b1ae523f2ca6903ee8", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1e17f5f7-2956-4bdd-baa9-c07591481341 is DONE. 536 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 5ec7c8b1-037c-466c-a51e-963f8274e76b is RUNNING. " ] }, "metadata": {}, @@ -1284,13 +1204,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "62563820bfb245be85bbc1bf3dfb993c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e2fde7a6-67b4-45a4-91d4-1cb9eff66ae5 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4e860716-bc41-4ef6-83ff-310d085ed7cc is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1298,13 +1216,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "98aff3bfded44868bf120451c89df9f5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e0683619-23c5-44fd-8930-9d3c9d02729a is DONE. 2.1 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6b96a757-42fe-4b65-92fd-a3ae339fe769 is RUNNING. " ] }, "metadata": {}, @@ -1344,154 +1260,154 @@ " \n", " \n", " \n", - " 3\n", - " 0.493935\n", - " 0.382978\n", - " -0.619171\n", - " \n", - " \n", - " 8\n", - " 1.050942\n", - " 0.953968\n", - " -0.044401\n", + " 1\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", " \n", " \n", - " 17\n", - " 1.255178\n", - " 1.1616\n", - " -0.547325\n", + " 4\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", " \n", " \n", - " 23\n", - " -1.307054\n", - " 0.694427\n", - " -0.547325\n", + " 8\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", " \n", " \n", - " 25\n", - " 1.515114\n", - " 0.486795\n", - " 0.027445\n", + " 11\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", " \n", " \n", - " 27\n", - " 1.236611\n", - " 1.265417\n", - " 0.027445\n", + " 13\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", " \n", " \n", - " 29\n", - " 1.403713\n", - " 0.953968\n", - " 0.027445\n", + " 15\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", " \n", " \n", - " 34\n", - " 0.419668\n", - " 0.538703\n", - " -1.62502\n", + " 16\n", + " -1.784958\n", + " 1.959995\n", + " -0.211715\n", " \n", " \n", - " 35\n", - " -1.455589\n", - " 0.694427\n", - " -1.050249\n", + " 23\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", " \n", " \n", - " 39\n", - " 0.326833\n", - " 1.1616\n", - " -0.475479\n", + " 34\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", " \n", " \n", - " 51\n", - " -1.065684\n", - " 0.227254\n", - " -0.978403\n", + " 36\n", + " -0.129833\n", + " -1.423399\n", + " 1.23938\n", " \n", " \n", - " 52\n", - " -0.248741\n", - " 0.071529\n", - " -0.978403\n", + " 42\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", " \n", " \n", - " 60\n", - " 0.531069\n", - " 0.382978\n", - " -0.403633\n", + " 48\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", " \n", " \n", " 61\n", - " 0.401101\n", - " 0.90206\n", - " -0.403633\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", " \n", " \n", " 64\n", - " -1.455589\n", - " 0.33107\n", - " -0.403633\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", " \n", " \n", " 65\n", - " -0.564378\n", - " 0.642519\n", - " -0.403633\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", " \n", " \n", - " 67\n", - " 1.273745\n", - " 1.317325\n", - " 0.171138\n", + " 68\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", " \n", " \n", - " 83\n", - " 2.629128\n", - " 0.33107\n", - " -1.409481\n", + " 70\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", " \n", " \n", - " 85\n", - " -1.288487\n", - " 0.746335\n", - " -0.83471\n", + " 72\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", " 
\n", " \n", - " 93\n", - " -0.508677\n", - " 0.486795\n", - " 0.314831\n", + " 74\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", " \n", " \n", - " 104\n", - " 0.382534\n", - " -0.032287\n", - " -0.762864\n", + " 77\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", " \n", " \n", - " 105\n", - " -1.065684\n", - " 0.746335\n", - " -0.762864\n", + " 81\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " \n", + " \n", + " 91\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", " \n", " \n", - " 108\n", - " 1.162343\n", - " 0.382978\n", - " -0.762864\n", + " 96\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", " \n", " \n", - " 113\n", - " 1.496547\n", - " 1.213509\n", - " 0.386677\n", + " 105\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", " \n", " \n", - " 130\n", - " -0.341575\n", - " 1.213509\n", - " -0.044401\n", + " 111\n", + " -1.164286\n", + " 0.697535\n", + " -2.098138\n", " \n", " \n", "\n", @@ -1501,65 +1417,65 @@ "text/plain": [ " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "3 0.493935 0.382978 \n", - "8 1.050942 0.953968 \n", - "17 1.255178 1.1616 \n", - "23 -1.307054 0.694427 \n", - "25 1.515114 0.486795 \n", - "27 1.236611 1.265417 \n", - "29 1.403713 0.953968 \n", - "34 0.419668 0.538703 \n", - "35 -1.455589 0.694427 \n", - "39 0.326833 1.1616 \n", - "51 -1.065684 0.227254 \n", - "52 -0.248741 0.071529 \n", - "60 0.531069 0.382978 \n", - "61 0.401101 0.90206 \n", - "64 -1.455589 0.33107 \n", - "65 -0.564378 0.642519 \n", - "67 1.273745 1.317325 \n", - "83 2.629128 0.33107 \n", - "85 -1.288487 0.746335 \n", - "93 -0.508677 0.486795 \n", - "104 0.382534 -0.032287 \n", - "105 -1.065684 0.746335 \n", - "108 1.162343 0.382978 \n", - "113 1.496547 1.213509 \n", - "130 -0.341575 1.213509 \n", + "1 -0.938587 0.748033 \n", + "4 -0.16745 0.899528 \n", + "8 0.453222 -1.877885 \n", + "11 -1.12667 0.697535 \n", + "13 -1.183094 1.404513 \n", + "15 0.867003 -0.766919 \n", + "16 -1.784958 1.959995 \n", + "23 -0.355532 0.647036 \n", + "34 -0.600039 -1.776888 \n", + "36 -0.129833 -1.423399 \n", + "42 -1.615684 -0.514427 \n", + "48 0.415606 -0.716421 \n", + "61 0.396797 -1.170907 \n", + "64 0.434414 -1.120408 \n", + "65 -1.220711 1.051024 \n", + "68 -1.484026 -0.009443 \n", + "70 1.638141 1.404513 \n", + "72 0.829387 0.142052 \n", + "74 -0.242683 -1.524396 \n", + "77 -1.277136 -0.211437 \n", + "81 0.208715 -1.221405 \n", + "91 1.261976 0.647036 \n", + "96 0.246331 -1.322402 \n", + "105 -1.803766 0.445043 \n", + "111 -1.164286 0.697535 \n", "\n", " standard_scaled_flipper_length_mm \n", "penguin_id \n", - "3 -0.619171 \n", - "8 -0.044401 \n", - "17 -0.547325 \n", - "23 -0.547325 \n", - "25 0.027445 \n", - "27 0.027445 \n", - "29 0.027445 \n", - "34 -1.62502 \n", - "35 -1.050249 \n", - "39 -0.475479 \n", - "51 -0.978403 \n", - "52 -0.978403 \n", - "60 -0.403633 \n", - "61 -0.403633 \n", - "64 -0.403633 \n", - "65 -0.403633 \n", - "67 0.171138 \n", - "83 -1.409481 \n", - "85 -0.83471 \n", - "93 0.314831 \n", - "104 -0.762864 \n", - "105 -0.762864 \n", - "108 -0.762864 \n", - "113 0.386677 \n", - "130 -0.044401 \n", + "1 -1.445145 \n", + "4 -0.284269 \n", + "8 0.658942 \n", + "11 -0.792152 \n", + "13 -0.792152 \n", + "15 0.513833 \n", + "16 -0.211715 \n", + "23 -1.5177 \n", + "34 0.949161 \n", + "36 1.23938 \n", + "42 -0.429379 \n", + "48 1.021716 \n", + "61 1.457044 \n", + "64 1.09427 \n", + "65 -1.445145 \n", + "68 -1.009817 \n", + "70 0.296168 \n", + "72 -0.719598 \n", + "74 0.586387 \n", + "77 -0.647043 \n", + 
"81 0.804051 \n", + "91 0.005949 \n", + "96 0.731497 \n", + "105 -1.009817 \n", + "111 -2.098138 \n", "...\n", "\n", "[67 rows x 3 columns]" ] }, - "execution_count": 23, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1581,32 +1497,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d642a617d27f4e2493c80dbdd1686193", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job a8d8afa4-d91e-487e-8709-8727a73ab453 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job b9afd624-4345-4160-8809-05786563ce35 is RUNNING. " ] }, "metadata": {}, @@ -1614,13 +1514,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "09217776c2294e8b929a56e7a73fbfa8", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 41962e2e-4d14-4053-9297-3ce61699551a is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c918fc7c-a956-4259-b5c5-09c2eac615cd is RUNNING. " ] }, "metadata": {}, @@ -1628,13 +1526,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9c1581fc9fcb49739d1d81b73506b894", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5d3c22c9-c972-4213-8557-726c9e0aca37 is DONE. 22.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1d855341-282f-4d10-9ba9-3ce6683b729a is RUNNING. " ] }, "metadata": {}, @@ -1642,13 +1538,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7749eb7cf554697a60c90f3718ad582", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 9cb7b33f-ea05-4cf4-9f92-bb3aa4ea8d10 is DONE. 2.1 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c257ff78-3e15-4296-82f5-ba6c2eb6a6ff is RUNNING. " ] }, "metadata": {}, @@ -1656,13 +1550,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e900465918224249bccc781d992aadbb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job fe1f35d6-d82c-4aab-a284-637b72554f5b is DONE. 29.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job a17eec0c-10d0-4943-95be-60fced57d5cb is RUNNING. " ] }, "metadata": {}, @@ -1670,13 +1562,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b0272ee35c5745a491b7c5883b3fbb1b", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 37bc90ff-59cb-4b0c-8f9d-73bcda43524a is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1db53c8a-cf45-4c69-a443-6b7a49fc3a07 is DONE. 536 Bytes processed. " ] }, "metadata": {}, @@ -1684,13 +1574,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "00f9d4b55bb94997aaebdae298cefab3", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e23f4724-fdd8-45a9-8c87-defd8d471035 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ae870ee3-e633-4556-94e6-6669fa0bfde2 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1698,13 +1586,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9cd8e791be5844669cba10dc53f862ae", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 257378db-0569-42d7-965a-7757154c710b is DONE. 21.4 kB processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job baa7c70c-eabc-49e1-bae9-fdd4891cdb6e is RUNNING. " ] }, "metadata": {}, @@ -1751,226 +1637,226 @@ " \n", " \n", " 0\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.344188\n", - " 0.642519\n", - " -1.193942\n", - " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", - " \n", - " \n", - " 1\n", + " -0.750505\n", + " 0.84903\n", + " -0.937262\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.750047\n", - " 1.005876\n", - " -1.193942\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 2\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.545811\n", - " 0.90206\n", - " -1.193942\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", + " 0.622496\n", + " -1.322402\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 4\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.214219\n", - " -0.188011\n", - " -0.619171\n", + " 3\n", " [{'index': 2, 'value': 1.0}]\n", + " -0.299107\n", + " -0.261935\n", + " -1.009817\n", " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 5\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.490839\n", + " -0.968913\n", + " 1.311935\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.118772\n", - " 0.694427\n", - " -0.619171\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 6\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.524806\n", + " 1.959995\n", + " -0.429379\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.568203\n", - " -0.291828\n", - " -0.619171\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 7\n", - " [{'index': 2, 'value': 1.0}]\n", - " 1.236611\n", - " 0.642519\n", - " -0.044401\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.726389\n", + " 1.021716\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 9\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.675779\n", - " 1.524957\n", - " -0.044401\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", + " 1.205551\n", + " -1.019412\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 10\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.772962\n", + " -0.817418\n", + " 1.457044\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.564378\n", - " 0.90206\n", - " 0.530369\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 11\n", + " 12\n", + " [{'index': 1, 'value': 1.0}]\n", + " 1.243168\n", + " -1.120408\n", + " 1.602153\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.898582\n", - " 0.798243\n", - " -1.122096\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 12\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.26992\n", - " -0.136103\n", - " -1.122096\n", - " [{'index': 2, 'value': 1.0}]\n", + " 14\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.709725\n", + " 0.344046\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 13\n", - " [{'index': 2, 'value': 1.0}]\n", - " 0.58677\n", - " 0.071529\n", - " -1.122096\n", - " [{'index': 2, 'value': 1.0}]\n", + " 17\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.509647\n", + " -0.918415\n", + " 1.021716\n", " [{'index': 
2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 14\n", + " 18\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.167935\n", + " 1.404513\n", + " -0.284269\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.826927\n", - " -0.032287\n", - " -1.122096\n", " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 15\n", + " 19\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.26992\n", - " -0.343736\n", - " -1.122096\n", + " -1.295944\n", + " 0.445043\n", + " -1.662809\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 16\n", + " 20\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.3454\n", - " 0.071529\n", - " -0.547325\n", + " 1.299593\n", + " 0.798532\n", + " 0.151059\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 18\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.768614\n", - " 0.382978\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 21\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.675272\n", + " 0.344046\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 19\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.121385\n", - " 0.486795\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 22\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.26514\n", + " -1.675891\n", + " 0.658942\n", " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 20\n", + " 24\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.512502\n", - " 0.33107\n", - " -0.547325\n", + " 1.43125\n", + " 1.556008\n", + " -0.501934\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 21\n", + " 25\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.385146\n", - " 1.057784\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 0.302756\n", + " 0.041055\n", + " -0.574488\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 22\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.675779\n", - " -0.032287\n", - " -0.547325\n", - " [{'index': 2, 'value': 1.0}]\n", + " 26\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.302756\n", + " -1.675891\n", + " 0.949161\n", " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 24\n", - " [{'index': 2, 'value': 1.0}]\n", - " 1.069509\n", - " 0.538703\n", - " -0.547325\n", + " 27\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.227523\n", + " -1.776888\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 26\n", + " 28\n", + " [{'index': 1, 'value': 1.0}]\n", + " 1.318401\n", + " -0.362932\n", + " 1.747263\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.43441\n", - " 0.694427\n", - " 0.027445\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 28\n", + " 29\n", + " [{'index': 2, 'value': 1.0}]\n", + " 2.202388\n", + " 1.303516\n", + " 0.441278\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.923586\n", - " 1.888314\n", - " 0.027445\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 30\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.292312\n", - " 0.694427\n", - " 0.027445\n", - " [{'index': 3, 'value': 1.0}]\n", + " -0.919779\n", + " 1.959995\n", + " -0.356824\n", " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 31\n", + " [{'index': 1, 'value': 1.0}]\n", + 
" 1.036277\n", + " -0.615424\n", + " 1.747263\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.994029\n", - " -0.551368\n", - " -1.62502\n", + " [{'index': 3, 'value': 1.0}]\n", + " \n", + " \n", + " 32\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.223874\n", + " 0.19255\n", + " -0.356824\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", @@ -1982,121 +1868,121 @@ "text/plain": [ " onehotencoded_island standard_scaled_culmen_length_mm \\\n", "penguin_id \n", - "0 [{'index': 2, 'value': 1.0}] -1.344188 \n", - "1 [{'index': 2, 'value': 1.0}] -0.750047 \n", - "2 [{'index': 2, 'value': 1.0}] -0.545811 \n", - "4 [{'index': 2, 'value': 1.0}] -1.214219 \n", - "5 [{'index': 2, 'value': 1.0}] -0.118772 \n", - "6 [{'index': 2, 'value': 1.0}] 0.568203 \n", - "7 [{'index': 2, 'value': 1.0}] 1.236611 \n", - "9 [{'index': 2, 'value': 1.0}] -0.675779 \n", - "10 [{'index': 2, 'value': 1.0}] -0.564378 \n", - "11 [{'index': 2, 'value': 1.0}] -0.898582 \n", - "12 [{'index': 2, 'value': 1.0}] -1.26992 \n", - "13 [{'index': 2, 'value': 1.0}] 0.58677 \n", - "14 [{'index': 2, 'value': 1.0}] -1.826927 \n", - "15 [{'index': 2, 'value': 1.0}] -1.26992 \n", - "16 [{'index': 2, 'value': 1.0}] 0.3454 \n", - "18 [{'index': 2, 'value': 1.0}] -0.768614 \n", - "19 [{'index': 2, 'value': 1.0}] -1.121385 \n", - "20 [{'index': 2, 'value': 1.0}] 0.512502 \n", - "21 [{'index': 2, 'value': 1.0}] 1.385146 \n", - "22 [{'index': 2, 'value': 1.0}] -0.675779 \n", - "24 [{'index': 2, 'value': 1.0}] 1.069509 \n", - "26 [{'index': 2, 'value': 1.0}] -0.43441 \n", - "28 [{'index': 2, 'value': 1.0}] 1.923586 \n", - "30 [{'index': 2, 'value': 1.0}] 1.292312 \n", - "31 [{'index': 2, 'value': 1.0}] -1.994029 \n", + "0 [{'index': 1, 'value': 1.0}] -0.750505 \n", + "2 [{'index': 1, 'value': 1.0}] 0.622496 \n", + "3 [{'index': 2, 'value': 1.0}] -0.299107 \n", + "5 [{'index': 1, 'value': 1.0}] 0.490839 \n", + "6 [{'index': 1, 'value': 1.0}] -0.524806 \n", + "7 [{'index': 1, 'value': 1.0}] 0.208715 \n", + "9 [{'index': 1, 'value': 1.0}] 1.205551 \n", + "10 [{'index': 1, 'value': 1.0}] 0.772962 \n", + "12 [{'index': 1, 'value': 1.0}] 1.243168 \n", + "14 [{'index': 1, 'value': 1.0}] -1.709725 \n", + "17 [{'index': 1, 'value': 1.0}] 0.509647 \n", + "18 [{'index': 2, 'value': 1.0}] 1.167935 \n", + "19 [{'index': 2, 'value': 1.0}] -1.295944 \n", + "20 [{'index': 2, 'value': 1.0}] 1.299593 \n", + "21 [{'index': 1, 'value': 1.0}] -0.675272 \n", + "22 [{'index': 1, 'value': 1.0}] 0.26514 \n", + "24 [{'index': 2, 'value': 1.0}] 1.43125 \n", + "25 [{'index': 2, 'value': 1.0}] 0.302756 \n", + "26 [{'index': 1, 'value': 1.0}] 0.302756 \n", + "27 [{'index': 1, 'value': 1.0}] 0.227523 \n", + "28 [{'index': 1, 'value': 1.0}] 1.318401 \n", + "29 [{'index': 2, 'value': 1.0}] 2.202388 \n", + "30 [{'index': 2, 'value': 1.0}] -0.919779 \n", + "31 [{'index': 1, 'value': 1.0}] 1.036277 \n", + "32 [{'index': 3, 'value': 1.0}] -0.223874 \n", "\n", " standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "0 0.642519 \n", - "1 1.005876 \n", - "2 0.90206 \n", - "4 -0.188011 \n", - "5 0.694427 \n", - "6 -0.291828 \n", - "7 0.642519 \n", - "9 1.524957 \n", - "10 0.90206 \n", - "11 0.798243 \n", - "12 -0.136103 \n", - "13 0.071529 \n", - "14 -0.032287 \n", - "15 -0.343736 \n", - "16 0.071529 \n", - "18 0.382978 \n", - "19 0.486795 \n", - "20 0.33107 \n", - "21 1.057784 \n", - "22 -0.032287 \n", - "24 0.538703 \n", - "26 0.694427 \n", - "28 1.888314 \n", - "30 0.694427 \n", - "31 -0.551368 \n", + "0 0.84903 \n", + "2 -1.322402 \n", + "3 
-0.261935 \n", + "5 -0.968913 \n", + "6 1.959995 \n", + "7 -1.726389 \n", + "9 -1.019412 \n", + "10 -0.817418 \n", + "12 -1.120408 \n", + "14 0.344046 \n", + "17 -0.918415 \n", + "18 1.404513 \n", + "19 0.445043 \n", + "20 0.798532 \n", + "21 0.344046 \n", + "22 -1.675891 \n", + "24 1.556008 \n", + "25 0.041055 \n", + "26 -1.675891 \n", + "27 -1.776888 \n", + "28 -0.362932 \n", + "29 1.303516 \n", + "30 1.959995 \n", + "31 -0.615424 \n", + "32 0.19255 \n", "\n", " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", "penguin_id \n", - "0 -1.193942 [{'index': 2, 'value': 1.0}] \n", - "1 -1.193942 [{'index': 3, 'value': 1.0}] \n", - "2 -1.193942 [{'index': 3, 'value': 1.0}] \n", - "4 -0.619171 [{'index': 2, 'value': 1.0}] \n", - "5 -0.619171 [{'index': 3, 'value': 1.0}] \n", - "6 -0.619171 [{'index': 2, 'value': 1.0}] \n", - "7 -0.044401 [{'index': 2, 'value': 1.0}] \n", - "9 -0.044401 [{'index': 3, 'value': 1.0}] \n", - "10 0.530369 [{'index': 3, 'value': 1.0}] \n", - "11 -1.122096 [{'index': 3, 'value': 1.0}] \n", - "12 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "13 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "14 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "15 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "16 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "18 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "19 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "20 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "21 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "22 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "24 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "26 0.027445 [{'index': 3, 'value': 1.0}] \n", - "28 0.027445 [{'index': 3, 'value': 1.0}] \n", - "30 0.027445 [{'index': 3, 'value': 1.0}] \n", - "31 -1.62502 [{'index': 2, 'value': 1.0}] \n", + "0 -0.937262 [{'index': 2, 'value': 1.0}] \n", + "2 0.804051 [{'index': 1, 'value': 1.0}] \n", + "3 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "5 1.311935 [{'index': 2, 'value': 1.0}] \n", + "6 -0.429379 [{'index': 2, 'value': 1.0}] \n", + "7 1.021716 [{'index': 1, 'value': 1.0}] \n", + "9 1.09427 [{'index': 1, 'value': 1.0}] \n", + "10 1.457044 [{'index': 2, 'value': 1.0}] \n", + "12 1.602153 [{'index': 2, 'value': 1.0}] \n", + "14 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "17 1.021716 [{'index': 2, 'value': 1.0}] \n", + "18 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "19 -1.662809 [{'index': 2, 'value': 1.0}] \n", + "20 0.151059 [{'index': 2, 'value': 1.0}] \n", + "21 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "22 0.658942 [{'index': 1, 'value': 1.0}] \n", + "24 -0.501934 [{'index': 2, 'value': 1.0}] \n", + "25 -0.574488 [{'index': 1, 'value': 1.0}] \n", + "26 0.949161 [{'index': 1, 'value': 1.0}] \n", + "27 0.658942 [{'index': 1, 'value': 1.0}] \n", + "28 1.747263 [{'index': 2, 'value': 1.0}] \n", + "29 0.441278 [{'index': 2, 'value': 1.0}] \n", + "30 -0.356824 [{'index': 2, 'value': 1.0}] \n", + "31 1.747263 [{'index': 2, 'value': 1.0}] \n", + "32 -0.356824 [{'index': 2, 'value': 1.0}] \n", "\n", " onehotencoded_species \n", "penguin_id \n", "0 [{'index': 1, 'value': 1.0}] \n", - "1 [{'index': 1, 'value': 1.0}] \n", - "2 [{'index': 1, 'value': 1.0}] \n", - "4 [{'index': 1, 'value': 1.0}] \n", - "5 [{'index': 1, 'value': 1.0}] \n", - "6 [{'index': 2, 'value': 1.0}] \n", - "7 [{'index': 2, 'value': 1.0}] \n", - "9 [{'index': 1, 'value': 1.0}] \n", - "10 [{'index': 1, 'value': 1.0}] \n", - "11 [{'index': 1, 'value': 1.0}] \n", - "12 [{'index': 1, 'value': 1.0}] \n", - "13 [{'index': 2, 'value': 1.0}] \n", + "2 [{'index': 3, 
'value': 1.0}] \n", + "3 [{'index': 2, 'value': 1.0}] \n", + "5 [{'index': 3, 'value': 1.0}] \n", + "6 [{'index': 1, 'value': 1.0}] \n", + "7 [{'index': 3, 'value': 1.0}] \n", + "9 [{'index': 3, 'value': 1.0}] \n", + "10 [{'index': 3, 'value': 1.0}] \n", + "12 [{'index': 3, 'value': 1.0}] \n", "14 [{'index': 1, 'value': 1.0}] \n", - "15 [{'index': 1, 'value': 1.0}] \n", - "16 [{'index': 2, 'value': 1.0}] \n", - "18 [{'index': 1, 'value': 1.0}] \n", + "17 [{'index': 3, 'value': 1.0}] \n", + "18 [{'index': 2, 'value': 1.0}] \n", "19 [{'index': 1, 'value': 1.0}] \n", "20 [{'index': 2, 'value': 1.0}] \n", - "21 [{'index': 2, 'value': 1.0}] \n", - "22 [{'index': 1, 'value': 1.0}] \n", + "21 [{'index': 1, 'value': 1.0}] \n", + "22 [{'index': 3, 'value': 1.0}] \n", "24 [{'index': 2, 'value': 1.0}] \n", - "26 [{'index': 1, 'value': 1.0}] \n", - "28 [{'index': 2, 'value': 1.0}] \n", - "30 [{'index': 2, 'value': 1.0}] \n", - "31 [{'index': 1, 'value': 1.0}] \n", + "25 [{'index': 2, 'value': 1.0}] \n", + "26 [{'index': 3, 'value': 1.0}] \n", + "27 [{'index': 3, 'value': 1.0}] \n", + "28 [{'index': 3, 'value': 1.0}] \n", + "29 [{'index': 2, 'value': 1.0}] \n", + "30 [{'index': 1, 'value': 1.0}] \n", + "31 [{'index': 3, 'value': 1.0}] \n", + "32 [{'index': 1, 'value': 1.0}] \n", "...\n", "\n", "[267 rows x 6 columns]" ] }, - "execution_count": 24, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -2138,18 +2024,28 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5db4c5c80ba4417db151aa561dab5ee7", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 7d9c9f8b-6b4c-451f-ae3d-06fb7090d148 is DONE. 21.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job be87ccfa-72ab-4858-9d4a-b2f5f8b2a5e6 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ceced0cc-13a7-4b14-b42c-4d5f69e7e49a is RUNNING. " ] }, "metadata": {}, @@ -2157,13 +2053,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e6b05d83de0e496d9e47392762046fc5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2d651fac-11bf-42da-8c18-bd33207379ca is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job a708b8df-6040-49b1-a6da-d2c0d162f247 is RUNNING. " ] }, "metadata": {}, @@ -2171,13 +2065,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "24d616c24a844abfbfd77ebd9f28486a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 58836ccc-242b-4574-bc48-4c269e74dbf1 is DONE. 5.7 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e9b9cbb5-f6a4-4d85-ba78-1edae77dce94 is RUNNING. " ] }, "metadata": {}, @@ -2185,13 +2077,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ce49b66c6fa0460aa3ee28746765b6ac", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1bf531f0-0fde-489b-ab36-6040a2a12377 is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6c0a41a7-a732-413a-b074-ba82f175eab8 is RUNNING. 
" ] }, "metadata": {}, @@ -2199,13 +2089,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a6010d73ca04ea9a133de99aa90da3c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4245f4e6-4d5b-404f-81d7-50f0553e2456 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 2d08b79d-9c36-4db7-824a-332fdd02e9fc is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2213,13 +2101,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ce9cfdca964a4062a52ebaae9d13ae59", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ed951699-c005-450e-a8b6-0916ec234e7f is DONE. 5.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 7fa0bf53-1022-45ee-b3ac-78fa5c155585 is RUNNING. " ] }, "metadata": {}, @@ -2247,152 +2133,397 @@ " \n", " \n", " predicted_body_mass_g\n", + " onehotencoded_island\n", + " standard_scaled_culmen_length_mm\n", + " standard_scaled_culmen_depth_mm\n", + " standard_scaled_flipper_length_mm\n", + " onehotencoded_sex\n", + " onehotencoded_species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3394.118128\n", + " 1\n", + " 3781.402407\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 8\n", - " 4048.685642\n", + " 4\n", + " 4124.107944\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 17\n", - " 3976.454093\n", + " 8\n", + " 4670.344196\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 23\n", - " 3541.582194\n", + " 11\n", + " 3529.417214\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 25\n", - " 4032.844186\n", + " 13\n", + " 4014.101714\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 27\n", - " 4118.351772\n", + " 15\n", + " 5212.41288\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 29\n", - " 4087.767826\n", + " 16\n", + " 4163.595615\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.784958\n", + " 1.959995\n", + " -0.211715\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 34\n", - " 3183.755249\n", + " 23\n", + " 3392.453069\n", + " [{'index': 2, 'value': 1.0}]\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 35\n", - " 3418.802274\n", - " \n", - " \n", - " 39\n", - " 3519.186468\n", + " 34\n", + " 4698.305397\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 51\n", - " 3398.135365\n", + " 36\n", + " 4828.226949\n", + " [{'index': 1, 'value': 1.0}]\n", + " 
-0.129833\n", + " -1.423399\n", + " 1.23938\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 52\n", - " 3223.615957\n", + " 42\n", + " 3430.58866\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 60\n", - " 3445.014718\n", + " 48\n", + " 5314.260221\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 61\n", - " 3505.638864\n", + " 5363.205372\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 64\n", - " 3515.905786\n", + " 4855.908314\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 65\n", - " 4028.363185\n", + " 3413.100524\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 67\n", - " 4159.993943\n", + " 68\n", + " 3340.219002\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 83\n", - " 3348.16883\n", + " 70\n", + " 4228.73157\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 85\n", - " 3485.050273\n", + " 72\n", + " 3811.538478\n", + " [{'index': 2, 'value': 1.0}]\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 93\n", - " 4172.874548\n", + " 74\n", + " 4659.770763\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 104\n", - " 3299.302424\n", + " 77\n", + " 3453.388804\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 105\n", - " 3515.687917\n", + " 81\n", + " 4766.245033\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", + " \n", + " \n", + " 91\n", + " 4057.807281\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 108\n", - " 3405.224618\n", + " 96\n", + " 4739.827445\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 113\n", - " 4209.140425\n", + " 105\n", + " 3394.891976\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 130\n", - " 4197.905737\n", + " 111\n", + " 3201.493683\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.164286\n", + " 
0.697535\n", + " -2.098138\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", "\n", - "
<p>25 rows × 1 columns</p>
\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.118128\n", - "8 4048.685642\n", - "17 3976.454093\n", - "23 3541.582194\n", - "25 4032.844186\n", - "27 4118.351772\n", - "29 4087.767826\n", - "34 3183.755249\n", - "35 3418.802274\n", - "39 3519.186468\n", - "51 3398.135365\n", - "52 3223.615957\n", - "60 3445.014718\n", - "61 3505.638864\n", - "64 3515.905786\n", - "65 4028.363185\n", - "67 4159.993943\n", - "83 3348.16883\n", - "85 3485.050273\n", - "93 4172.874548\n", - "104 3299.302424\n", - "105 3515.687917\n", - "108 3405.224618\n", - "113 4209.140425\n", - "130 4197.905737\n", - "...\n", + "
<p>25 rows × 7 columns</p>
\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g onehotencoded_island \\\n", + "penguin_id \n", + "1 3781.402407 [{'index': 3, 'value': 1.0}] \n", + "4 4124.107944 [{'index': 1, 'value': 1.0}] \n", + "8 4670.344196 [{'index': 1, 'value': 1.0}] \n", + "11 3529.417214 [{'index': 2, 'value': 1.0}] \n", + "13 4014.101714 [{'index': 1, 'value': 1.0}] \n", + "15 5212.41288 [{'index': 1, 'value': 1.0}] \n", + "16 4163.595615 [{'index': 3, 'value': 1.0}] \n", + "23 3392.453069 [{'index': 2, 'value': 1.0}] \n", + "34 4698.305397 [{'index': 1, 'value': 1.0}] \n", + "36 4828.226949 [{'index': 1, 'value': 1.0}] \n", + "42 3430.58866 [{'index': 1, 'value': 1.0}] \n", + "48 5314.260221 [{'index': 1, 'value': 1.0}] \n", + "61 5363.205372 [{'index': 1, 'value': 1.0}] \n", + "64 4855.908314 [{'index': 1, 'value': 1.0}] \n", + "65 3413.100524 [{'index': 2, 'value': 1.0}] \n", + "68 3340.219002 [{'index': 3, 'value': 1.0}] \n", + "70 4228.73157 [{'index': 2, 'value': 1.0}] \n", + "72 3811.538478 [{'index': 2, 'value': 1.0}] \n", + "74 4659.770763 [{'index': 1, 'value': 1.0}] \n", + "77 3453.388804 [{'index': 2, 'value': 1.0}] \n", + "81 4766.245033 [{'index': 1, 'value': 1.0}] \n", + "91 4057.807281 [{'index': 2, 'value': 1.0}] \n", + "96 4739.827445 [{'index': 1, 'value': 1.0}] \n", + "105 3394.891976 [{'index': 1, 'value': 1.0}] \n", + "111 3201.493683 [{'index': 1, 'value': 1.0}] \n", "\n", - "[67 rows x 1 columns]" + " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 -0.938587 0.748033 \n", + "4 -0.16745 0.899528 \n", + "8 0.453222 -1.877885 \n", + "11 -1.12667 0.697535 \n", + "13 -1.183094 1.404513 \n", + "15 0.867003 -0.766919 \n", + "16 -1.784958 1.959995 \n", + "23 -0.355532 0.647036 \n", + "34 -0.600039 -1.776888 \n", + "36 -0.129833 -1.423399 \n", + "42 -1.615684 -0.514427 \n", + "48 0.415606 -0.716421 \n", + "61 0.396797 -1.170907 \n", + "64 0.434414 -1.120408 \n", + "65 -1.220711 1.051024 \n", + "68 -1.484026 -0.009443 \n", + "70 1.638141 1.404513 \n", + "72 0.829387 0.142052 \n", + "74 -0.242683 -1.524396 \n", + "77 -1.277136 -0.211437 \n", + "81 0.208715 -1.221405 \n", + "91 1.261976 0.647036 \n", + "96 0.246331 -1.322402 \n", + "105 -1.803766 0.445043 \n", + "111 -1.164286 0.697535 \n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 
[{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] \n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 25, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -2423,18 +2554,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a16e04253a42b7a5ce247d8f63b656", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 027042f1-9a18-43d8-a378-ab9410e395b1 is DONE. 23.5 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f19614c-82c0-4f8b-b74b-9d91a894efdd is RUNNING. " ] }, "metadata": {}, @@ -2442,13 +2571,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a99ac15431e433595de1040872a4558", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 6c8484a0-a504-4e50-93d6-3d247c9ff558 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 51899e2d-f6ef-4e62-98b6-c11550f74f4b is RUNNING. " ] }, "metadata": {}, @@ -2456,13 +2583,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "90909b620e084f59b0f9da266257593f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e81ca2de-df2e-41ec-af86-14f8dcec1b44 is DONE. 6.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 44d3fddc-74bc-4de0-a458-2c73b38f74fb is RUNNING. " ] }, "metadata": {}, @@ -2470,13 +2595,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2a9c2c05041a4fb691809bab5310bb05", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 3e6d413c-f8c4-4390-95eb-3a1f5bc59aed is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 33584475-f02b-4c98-9a51-e29996f4f950 is RUNNING. " ] }, "metadata": {}, @@ -2484,13 +2607,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6b0677c228d54b409c66e5dfa98d7e00", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e448220d-0c50-45b7-bcbe-d1159b3d18ce is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job df25ba49-280e-424d-a357-dde71a9b35dd is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2498,13 +2619,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "379ae6497fb34f969d21b2cd664e8bfa", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e167a234-828d-4f05-8654-63cf97e50ba3 is DONE. 10.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f92a04e-af7e-41d6-9303-6366c1751294 is RUNNING. 
" ] }, "metadata": {}, @@ -2532,152 +2651,452 @@ " \n", " \n", " CENTROID_ID\n", + " NEAREST_CENTROIDS_DISTANCE\n", + " onehotencoded_island\n", + " standard_scaled_culmen_length_mm\n", + " standard_scaled_culmen_depth_mm\n", + " standard_scaled_flipper_length_mm\n", + " onehotencoded_sex\n", + " onehotencoded_species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3\n", - " \n", - " \n", - " 8\n", + " 1\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 17\n", + " 4\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 23\n", + " 8\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 25\n", - " 3\n", + " 11\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 27\n", + " 13\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 29\n", - " 3\n", + " 15\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 34\n", + " 16\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.784958\n", + " 1.959995\n", + " -0.211715\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 35\n", - " 1\n", + " 23\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 39\n", - " 3\n", + " 34\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 51\n", + " 36\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.129833\n", + " -1.423399\n", + " 1.23938\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 52\n", - " 3\n", + " 42\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 60\n", - " 3\n", + " 48\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 
0.882209023196...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 61\n", - " 3\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 64\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 65\n", - " 1\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 67\n", - " 3\n", + " 68\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 83\n", - " 3\n", + " 70\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 85\n", - " 1\n", + " 72\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 93\n", + " 74\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 104\n", - " 3\n", + " 77\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 105\n", + " 81\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 108\n", - " 3\n", + " 91\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 113\n", - " 3\n", + " 96\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 130\n", - " 1\n", + " 105\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " \n", + " \n", + " 111\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385...\n", + " 
[{'index': 1, 'value': 1.0}]\n", + " -1.164286\n", + " 0.697535\n", + " -2.098138\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", "\n", - "
<p>25 rows × 1 columns</p>
\n", - "[67 rows x 1 columns in total]" + "
<p>25 rows × 8 columns</p>
\n", + "[67 rows x 8 columns in total]" ], "text/plain": [ - " CENTROID_ID\n", - "penguin_id \n", - "3 3\n", - "8 3\n", - "17 3\n", - "23 1\n", - "25 3\n", - "27 3\n", - "29 3\n", - "34 3\n", - "35 1\n", - "39 3\n", - "51 1\n", - "52 3\n", - "60 3\n", - "61 3\n", - "64 1\n", - "65 1\n", - "67 3\n", - "83 3\n", - "85 1\n", - "93 1\n", - "104 3\n", - "105 1\n", - "108 3\n", - "113 3\n", - "130 1\n", - "...\n", + " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", + "penguin_id \n", + "1 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035... \n", + "4 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856... \n", + "8 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975... \n", + "11 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012... \n", + "13 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949... \n", + "15 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772... \n", + "16 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720... \n", + "23 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483... \n", + "34 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302... \n", + "36 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190... \n", + "42 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930... \n", + "48 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196... \n", + "61 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282... \n", + "64 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625... \n", + "65 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148... \n", + "68 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449... \n", + "70 4 [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955... \n", + "72 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009... \n", + "74 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546... \n", + "77 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663... \n", + "81 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073... \n", + "91 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086... \n", + "96 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227... \n", + "105 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029... \n", + "111 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385... 
\n", "\n", - "[67 rows x 1 columns]" + " onehotencoded_island standard_scaled_culmen_length_mm \\\n", + "penguin_id \n", + "1 [{'index': 3, 'value': 1.0}] -0.938587 \n", + "4 [{'index': 1, 'value': 1.0}] -0.16745 \n", + "8 [{'index': 1, 'value': 1.0}] 0.453222 \n", + "11 [{'index': 2, 'value': 1.0}] -1.12667 \n", + "13 [{'index': 1, 'value': 1.0}] -1.183094 \n", + "15 [{'index': 1, 'value': 1.0}] 0.867003 \n", + "16 [{'index': 3, 'value': 1.0}] -1.784958 \n", + "23 [{'index': 2, 'value': 1.0}] -0.355532 \n", + "34 [{'index': 1, 'value': 1.0}] -0.600039 \n", + "36 [{'index': 1, 'value': 1.0}] -0.129833 \n", + "42 [{'index': 1, 'value': 1.0}] -1.615684 \n", + "48 [{'index': 1, 'value': 1.0}] 0.415606 \n", + "61 [{'index': 1, 'value': 1.0}] 0.396797 \n", + "64 [{'index': 1, 'value': 1.0}] 0.434414 \n", + "65 [{'index': 2, 'value': 1.0}] -1.220711 \n", + "68 [{'index': 3, 'value': 1.0}] -1.484026 \n", + "70 [{'index': 2, 'value': 1.0}] 1.638141 \n", + "72 [{'index': 2, 'value': 1.0}] 0.829387 \n", + "74 [{'index': 1, 'value': 1.0}] -0.242683 \n", + "77 [{'index': 2, 'value': 1.0}] -1.277136 \n", + "81 [{'index': 1, 'value': 1.0}] 0.208715 \n", + "91 [{'index': 2, 'value': 1.0}] 1.261976 \n", + "96 [{'index': 1, 'value': 1.0}] 0.246331 \n", + "105 [{'index': 1, 'value': 1.0}] -1.803766 \n", + "111 [{'index': 1, 'value': 1.0}] -1.164286 \n", + "\n", + " standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 0.748033 \n", + "4 0.899528 \n", + "8 -1.877885 \n", + "11 0.697535 \n", + "13 1.404513 \n", + "15 -0.766919 \n", + "16 1.959995 \n", + "23 0.647036 \n", + "34 -1.776888 \n", + "36 -1.423399 \n", + "42 -0.514427 \n", + "48 -0.716421 \n", + "61 -1.170907 \n", + "64 -1.120408 \n", + "65 1.051024 \n", + "68 -0.009443 \n", + "70 1.404513 \n", + "72 0.142052 \n", + "74 -1.524396 \n", + "77 -0.211437 \n", + "81 -1.221405 \n", + "91 0.647036 \n", + "96 -1.322402 \n", + "105 0.445043 \n", + "111 0.697535 \n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] 
\n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 8 columns]" ] }, - "execution_count": 26, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -2704,7 +3123,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -2721,7 +3140,7 @@ " ('linreg', LinearRegression())])" ] }, - "execution_count": 27, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -2748,18 +3167,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "887bf58cebf14bdba95db828390fd33d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job b11be0d8-e6f1-41cb-8cb2-25a38e7ef311 is DONE. 24.7 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job ed42cbb3-3d25-47ca-96c5-71a84e426a8c is RUNNING. " ] }, "metadata": {}, @@ -2767,13 +3184,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "24357055792a4eaaa60997fea0f76921", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job f32ea25c-be39-4726-a8f5-604ae83849a6 is DONE. 8.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 3fc74930-03b9-4a49-8ed3-c3edc4dd6e51 is RUNNING. " ] }, "metadata": {}, @@ -2781,13 +3196,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bba878d6d3e345f1a29aea50f7101e8f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 86e29b78-76f5-4937-8bde-407b99af04a2 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 38a4ce3b-5c2a-4d44-b826-f24529d6500b is RUNNING. " ] }, "metadata": {}, @@ -2795,13 +3208,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4bc2c53aeb7d4a8280f9fbbe373f4b55", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ca819734-0d41-4d9e-b743-09edae8c7fee is DONE. 29.6 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ecad776d-77c8-4d94-8186-d5571b512b62 is RUNNING. " ] }, "metadata": {}, @@ -2809,13 +3220,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f4f695cb0a224102b6e26adeb1827981", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 49bb5bed-cc84-47e0-9a90-08ab01e00548 is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c9bfc58f-ce2c-47a9-bbc7-b10d9de9b5a6 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2823,13 +3232,23 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cb1df595006d485288a1060299970e5e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1e40a085-2289-47dd-afd8-820413186b9f is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 8fd8036e-3753-433d-975b-c7b42406f648 is RUNNING. " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 60319296-a480-4f51-b7ad-190ac6de963a is DONE. 6.2 kB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, "metadata": {}, @@ -2857,152 +3276,369 @@ " \n", " \n", " predicted_body_mass_g\n", + " island\n", + " culmen_length_mm\n", + " culmen_depth_mm\n", + " flipper_length_mm\n", + " sex\n", + " species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3394.116212\n", - " \n", - " \n", - " 8\n", - " 4048.683645\n", + " 1\n", + " 3781.396682\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 17\n", - " 3976.452358\n", + " 4\n", + " 4124.102574\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 23\n", - " 3541.580346\n", + " 8\n", + " 4670.338389\n", + " Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 25\n", - " 4032.842027\n", + " 11\n", + " 3529.411644\n", + " Dream\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 27\n", - " 4118.34983\n", + " 13\n", + " 4014.09632\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 29\n", - " 4087.765797\n", + " 15\n", + " 5212.407319\n", + " Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 34\n", - " 3183.75379\n", + " 16\n", + " 4163.590502\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 35\n", - " 3418.800633\n", + " 23\n", + " 3392.44731\n", + " Dream\n", + " 42.2\n", + " 18.5\n", + " 180.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 39\n", - " 3519.18471\n", + " 34\n", + " 4698.299674\n", + " Biscoe\n", + " 40.9\n", + " 13.7\n", + " 214.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 51\n", - " 3398.133564\n", + " 36\n", + " 4828.221398\n", + " Biscoe\n", + " 43.4\n", + " 14.4\n", + " 218.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 52\n", - " 3223.614107\n", + " 42\n", + " 3430.582874\n", + " Biscoe\n", + " 35.5\n", + " 16.2\n", + " 195.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 60\n", - " 3445.012713\n", + " 48\n", + " 5314.254798\n", + " Biscoe\n", + " 46.3\n", + " 15.8\n", + " 215.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 61\n", - " 3505.637004\n", + " 5363.19995\n", + " Biscoe\n", + " 46.2\n", + " 14.9\n", + " 221.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 64\n", - " 3515.903779\n", + " 4855.90281\n", + " Biscoe\n", + " 46.4\n", + " 15.0\n", + " 216.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 65\n", - " 4028.361259\n", + " 3413.094869\n", + " Dream\n", + " 37.6\n", + " 19.3\n", + " 181.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 67\n", - " 4159.991956\n", + " 68\n", + " 3340.213193\n", + " Torgersen\n", + " 36.2\n", + " 17.2\n", + " 187.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 83\n", - " 3348.167212\n", + " 70\n", + " 4228.726508\n", + " Dream\n", + " 52.8\n", + " 20.0\n", + " 205.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis 
antarctica)\n", " \n", " \n", - " 85\n", - " 3485.048557\n", + " 72\n", + " 3811.532821\n", + " Dream\n", + " 48.5\n", + " 17.5\n", + " 191.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " \n", " \n", - " 93\n", - " 4172.872284\n", + " 74\n", + " 4659.765013\n", + " Biscoe\n", + " 42.8\n", + " 14.2\n", + " 209.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 104\n", - " 3299.300454\n", + " 77\n", + " 3453.383042\n", + " Dream\n", + " 37.3\n", + " 16.8\n", + " 192.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 105\n", - " 3515.68617\n", + " 81\n", + " 4766.239424\n", + " Biscoe\n", + " 45.2\n", + " 14.8\n", + " 212.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 108\n", - " 3405.222757\n", + " 91\n", + " 4057.801947\n", + " Dream\n", + " 50.8\n", + " 18.5\n", + " 201.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " \n", + " \n", + " 96\n", + " 4739.821792\n", + " Biscoe\n", + " 45.4\n", + " 14.6\n", + " 211.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 113\n", - " 4209.13832\n", + " 105\n", + " 3394.886275\n", + " Biscoe\n", + " 34.5\n", + " 18.1\n", + " 187.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 130\n", - " 4197.90382\n", + " 111\n", + " 3201.48777\n", + " Biscoe\n", + " 37.9\n", + " 18.6\n", + " 172.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", "\n", - "
<p>25 rows × 1 columns</p>
\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.116212\n", - "8 4048.683645\n", - "17 3976.452358\n", - "23 3541.580346\n", - "25 4032.842027\n", - "27 4118.34983\n", - "29 4087.765797\n", - "34 3183.75379\n", - "35 3418.800633\n", - "39 3519.18471\n", - "51 3398.133564\n", - "52 3223.614107\n", - "60 3445.012713\n", - "61 3505.637004\n", - "64 3515.903779\n", - "65 4028.361259\n", - "67 4159.991956\n", - "83 3348.167212\n", - "85 3485.048557\n", - "93 4172.872284\n", - "104 3299.300454\n", - "105 3515.68617\n", - "108 3405.222757\n", - "113 4209.13832\n", - "130 4197.90382\n", - "...\n", + "
<p>25 rows × 7 columns</p>
\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g island culmen_length_mm \\\n", + "penguin_id \n", + "1 3781.396682 Torgersen 39.1 \n", + "4 4124.102574 Biscoe 43.2 \n", + "8 4670.338389 Biscoe 46.5 \n", + "11 3529.411644 Dream 38.1 \n", + "13 4014.09632 Biscoe 37.8 \n", + "15 5212.407319 Biscoe 48.7 \n", + "16 4163.590502 Torgersen 34.6 \n", + "23 3392.44731 Dream 42.2 \n", + "34 4698.299674 Biscoe 40.9 \n", + "36 4828.221398 Biscoe 43.4 \n", + "42 3430.582874 Biscoe 35.5 \n", + "48 5314.254798 Biscoe 46.3 \n", + "61 5363.19995 Biscoe 46.2 \n", + "64 4855.90281 Biscoe 46.4 \n", + "65 3413.094869 Dream 37.6 \n", + "68 3340.213193 Torgersen 36.2 \n", + "70 4228.726508 Dream 52.8 \n", + "72 3811.532821 Dream 48.5 \n", + "74 4659.765013 Biscoe 42.8 \n", + "77 3453.383042 Dream 37.3 \n", + "81 4766.239424 Biscoe 45.2 \n", + "91 4057.801947 Dream 50.8 \n", + "96 4739.821792 Biscoe 45.4 \n", + "105 3394.886275 Biscoe 34.5 \n", + "111 3201.48777 Biscoe 37.9 \n", "\n", - "[67 rows x 1 columns]" + " culmen_depth_mm flipper_length_mm sex \\\n", + "penguin_id \n", + "1 18.7 181.0 MALE \n", + "4 19.0 197.0 MALE \n", + "8 13.5 210.0 FEMALE \n", + "11 18.6 190.0 FEMALE \n", + "13 20.0 190.0 MALE \n", + "15 15.7 208.0 MALE \n", + "16 21.1 198.0 MALE \n", + "23 18.5 180.0 FEMALE \n", + "34 13.7 214.0 FEMALE \n", + "36 14.4 218.0 FEMALE \n", + "42 16.2 195.0 FEMALE \n", + "48 15.8 215.0 MALE \n", + "61 14.9 221.0 MALE \n", + "64 15.0 216.0 FEMALE \n", + "65 19.3 181.0 FEMALE \n", + "68 17.2 187.0 FEMALE \n", + "70 20.0 205.0 MALE \n", + "72 17.5 191.0 MALE \n", + "74 14.2 209.0 FEMALE \n", + "77 16.8 192.0 FEMALE \n", + "81 14.8 212.0 FEMALE \n", + "91 18.5 201.0 MALE \n", + "96 14.6 211.0 FEMALE \n", + "105 18.1 187.0 FEMALE \n", + "111 18.6 172.0 FEMALE \n", + "\n", + " species \n", + "penguin_id \n", + "1 Adelie Penguin (Pygoscelis adeliae) \n", + "4 Adelie Penguin (Pygoscelis adeliae) \n", + "8 Gentoo penguin (Pygoscelis papua) \n", + "11 Adelie Penguin (Pygoscelis adeliae) \n", + "13 Adelie Penguin (Pygoscelis adeliae) \n", + "15 Gentoo penguin (Pygoscelis papua) \n", + "16 Adelie Penguin (Pygoscelis adeliae) \n", + "23 Adelie Penguin (Pygoscelis adeliae) \n", + "34 Gentoo penguin (Pygoscelis papua) \n", + "36 Gentoo penguin (Pygoscelis papua) \n", + "42 Adelie Penguin (Pygoscelis adeliae) \n", + "48 Gentoo penguin (Pygoscelis papua) \n", + "61 Gentoo penguin (Pygoscelis papua) \n", + "64 Gentoo penguin (Pygoscelis papua) \n", + "65 Adelie Penguin (Pygoscelis adeliae) \n", + "68 Adelie Penguin (Pygoscelis adeliae) \n", + "70 Chinstrap penguin (Pygoscelis antarctica) \n", + "72 Chinstrap penguin (Pygoscelis antarctica) \n", + "74 Gentoo penguin (Pygoscelis papua) \n", + "77 Adelie Penguin (Pygoscelis adeliae) \n", + "81 Gentoo penguin (Pygoscelis papua) \n", + "91 Chinstrap penguin (Pygoscelis antarctica) \n", + "96 Gentoo penguin (Pygoscelis papua) \n", + "105 Adelie Penguin (Pygoscelis adeliae) \n", + "111 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 28, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -3034,60 +3670,16 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2d32081be31f44abb8de67e2209d76cd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 2a043039-670f-4eb8-9cf0-765ee6ed7de6 is RUNNING. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job bc8b2042-1e13-441c-9531-300ed5badb7a is RUNNING. " ] }, "metadata": {}, @@ -3095,13 +3687,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4588ae10de634460bf4026ddd9076351", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 7f1f565b-0f73-4a4e-b33f-8484fa260838 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 5e867182-dd7a-4aff-87a8-f7596e900fd5 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -3109,13 +3699,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8209cf8286a545ebb7b6ef9d002a43a1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job d4b9d4a6-d75e-46e1-b092-ab58e8aef890 is DONE. 48 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d4cdb016-8f1e-4960-8ed7-4524ccc5a8a8 is RUNNING. " ] }, "metadata": {}, @@ -3153,12 +3741,12 @@ " \n", " \n", " 0\n", - " 229.48269\n", - " 82962.794947\n", - " 0.004248\n", - " 206.728384\n", - " 0.88633\n", - " 0.892953\n", + " 216.444357\n", + " 72639.698707\n", + " 0.00463\n", + " 170.588356\n", + " 0.896396\n", + " 0.900547\n", " \n", " \n", "\n", @@ -3167,15 +3755,15 @@ ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 229.48269 82962.794947 0.004248 \n", + "0 216.444357 72639.698707 0.00463 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 206.728384 0.88633 0.892953 \n", + "0 170.588356 0.896396 0.900547 \n", "\n", "[1 rows x 6 columns]" ] }, - "execution_count": 29, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -3195,18 +3783,16 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f32692d89f00406499f4ea5aa55268fb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 73448ee8-698b-435f-b11e-6fe2de3bcd8d is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e57383ef-f043-458b-96c6-893e7c5b0c00 is RUNNING. " ] }, "metadata": {}, @@ -3214,13 +3800,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "72e5f23a99de4a818c8493b8b4f3854d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e002f59d-a03c-4ec9-a85a-93adbfd7bd17 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1a9db485-477b-43e2-94eb-dea7dc21d45d is RUNNING. " ] }, "metadata": {}, @@ -3228,13 +3812,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9d5333a91b504dd9be51c997715530ab", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4ab1febc-fb55-473a-b295-69e4329cc5f0 is DONE. 30.0 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4570a563-b8e0-4308-b8cb-c4731491d4f7 is RUNNING. 
" ] }, "metadata": {}, @@ -3243,10 +3825,10 @@ { "data": { "text/plain": [ - "0.8863300923278365" + "0.8963962044533755" ] }, - "execution_count": 30, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3254,7 +3836,7 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "r2_score(y_test, predicted_y_test)" + "r2_score(y_test, predicted_y_test[\"predicted_body_mass_g\"])" ] }, { @@ -3274,57 +3856,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fbc4a70f31d4465b974a7f7c9cc97731", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Copy job c2413be4-6972-4e36-8234-5063628b6d71 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 31a5b656-000e-4238-9fd9-c6e644ca298f is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -67,13 +37,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0f25faa156584cc59dda9b0e60f72534", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 12e0f983-695e-4903-8ff1-2f353d7e8cba is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d8eed0ca-7ce9-4ed8-a592-e16af9f9db8d is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -113,250 +81,250 @@ " \n", " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", + " MALE\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 2\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 47.4\n", + " 14.6\n", + " 212.0\n", + " 4725.0\n", + " FEMALE\n", " \n", " \n", " 3\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.5\n", - " 17.9\n", - " 192.0\n", - " 3500.0\n", + " 42.5\n", + " 16.7\n", + " 187.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", + " MALE\n", " \n", " \n", " 5\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.7\n", + " 15.3\n", + " 219.0\n", + " 5200.0\n", " MALE\n", " \n", " \n", " 6\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 46.9\n", - " 16.6\n", - " 192.0\n", - " 2700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 7\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.5\n", - " 18.4\n", - " 200.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.2\n", + " 13.8\n", + " 215.0\n", + " 4750.0\n", " FEMALE\n", " \n", " \n", " 8\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 49.5\n", - " 19.0\n", - " 200.0\n", - " 3800.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " 
Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " 4550.0\n", + " FEMALE\n", " \n", " \n", " 9\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.5\n", + " 15.2\n", + " 216.0\n", + " 5000.0\n", + " FEMALE\n", " \n", " \n", " 10\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.2\n", + " 15.6\n", + " 221.0\n", + " 5100.0\n", " MALE\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", + " FEMALE\n", " \n", " \n", " 12\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.7\n", + " 15.0\n", + " 223.0\n", + " 5550.0\n", + " MALE\n", " \n", " \n", " 13\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185.0\n", - " 3700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", " 15\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " 5350.0\n", + " MALE\n", " \n", " \n", " 16\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 45.7\n", - " 17.3\n", - " 193.0\n", - " 3600.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 17\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.6\n", - " 19.4\n", - " 193.0\n", - " 3800.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.8\n", + " 15.4\n", + " 215.0\n", + " 5150.0\n", " MALE\n", " \n", " \n", " 18\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 50.3\n", + " 20.0\n", + " 197.0\n", + " 3300.0\n", " MALE\n", " \n", " \n", " 19\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", " 20\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.6\n", - " 17.8\n", - " 193.0\n", - " 3800.0\n", - " FEMALE\n", + " 51.0\n", + " 18.8\n", + " 203.0\n", + " 4100.0\n", + " MALE\n", " \n", " \n", " 21\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 51.3\n", - " 19.2\n", - " 193.0\n", - " 3650.0\n", - " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", + " FEMALE\n", " \n", " \n", " 22\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " 
Biscoe\n", + " 45.5\n", + " 13.9\n", + " 210.0\n", + " 4200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 49.6\n", - " 18.2\n", - " 193.0\n", + " 51.7\n", + " 20.3\n", + " 194.0\n", " 3775.0\n", " MALE\n", " \n", @@ -366,65 +334,65 @@ "[344 rows x 7 columns in total]" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + " species island culmen_length_mm \\\n", + "0 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.1 \n", + "1 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.1 \n", + "2 Gentoo penguin (Pygoscelis papua) Biscoe 47.4 \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", + "4 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", + "5 Gentoo penguin (Pygoscelis papua) Biscoe 46.7 \n", + "6 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", + "7 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "8 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", + "10 Gentoo penguin (Pygoscelis papua) Biscoe 48.2 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", + "14 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.0 \n", + "15 Gentoo penguin (Pygoscelis papua) Biscoe 48.7 \n", + "16 Adelie Penguin (Pygoscelis adeliae) Torgersen 34.6 \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", + "18 Chinstrap penguin (Pygoscelis antarctica) Dream 50.3 \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", + "21 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream 42.2 \n", + "24 Chinstrap penguin 
(Pygoscelis antarctica) Dream 51.7 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.4 184.0 3475.0 FEMALE \n", - "1 19.1 184.0 4650.0 MALE \n", - "2 18.9 184.0 3900.0 MALE \n", - "3 17.9 192.0 3500.0 FEMALE \n", - "4 16.8 192.0 3000.0 FEMALE \n", - "5 18.5 192.0 4100.0 MALE \n", - "6 16.6 192.0 2700.0 FEMALE \n", - "7 18.4 200.0 3400.0 FEMALE \n", - "8 19.0 200.0 3800.0 MALE \n", - "9 20.1 200.0 3975.0 MALE \n", - "10 18.9 208.0 4300.0 MALE \n", - "11 18.7 185.0 3650.0 MALE \n", - "12 16.9 185.0 3000.0 FEMALE \n", - "13 17.3 185.0 3700.0 FEMALE \n", - "14 17.1 185.0 3400.0 FEMALE \n", - "15 16.5 185.0 3400.0 FEMALE \n", - "16 17.3 193.0 3600.0 FEMALE \n", - "17 19.4 193.0 3800.0 MALE \n", - "18 17.9 193.0 4250.0 MALE \n", - "19 18.1 193.0 3750.0 MALE \n", - "20 17.8 193.0 3800.0 FEMALE \n", - "21 19.2 193.0 3650.0 MALE \n", - "22 17.1 193.0 3400.0 FEMALE \n", - "23 18.5 193.0 3500.0 FEMALE \n", - "24 18.2 193.0 3775.0 MALE \n", + "0 18.9 188.0 4300.0 MALE \n", + "1 18.7 181.0 3750.0 MALE \n", + "2 14.6 212.0 4725.0 FEMALE \n", + "3 16.7 187.0 3350.0 FEMALE \n", + "4 19.0 197.0 4775.0 MALE \n", + "5 15.3 219.0 5200.0 MALE \n", + "6 21.1 195.0 4400.0 MALE \n", + "7 13.8 215.0 4750.0 FEMALE \n", + "8 13.5 210.0 4550.0 FEMALE \n", + "9 15.2 216.0 5000.0 FEMALE \n", + "10 15.6 221.0 5100.0 MALE \n", + "11 18.6 190.0 3700.0 FEMALE \n", + "12 15.0 223.0 5550.0 MALE \n", + "13 20.0 190.0 4250.0 MALE \n", + "14 17.9 190.0 3450.0 FEMALE \n", + "15 15.7 208.0 5350.0 MALE \n", + "16 21.1 198.0 4400.0 MALE \n", + "17 15.4 215.0 5150.0 MALE \n", + "18 20.0 197.0 3300.0 MALE \n", + "19 18.1 178.0 3900.0 MALE \n", + "20 18.8 203.0 4100.0 MALE \n", + "21 17.9 187.0 3200.0 FEMALE \n", + "22 13.9 210.0 4200.0 FEMALE \n", + "23 18.5 180.0 3550.0 FEMALE \n", + "24 20.3 194.0 3775.0 MALE \n", "...\n", "\n", "[344 rows x 7 columns]" ] }, - "execution_count": 12, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -450,32 +418,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a9ad907fa6e64a61a9dce420bc7d2beb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 3537a10a-641a-4d40-ae47-449c641b1bc5 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 34101409-7c65-4045-ad52-c6ba24dc9cbb is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -483,13 +435,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "514e68d5b0b4452a9ccdff947848541a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2af0b0d6-c11b-499e-8d25-a2c628b2853b is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 74190ac2-21a2-47b0-bc21-ef5373565f17 is DONE. 0 Bytes processed. 
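The 344-row table above is the full penguins dataset as loaded by the notebook. A hedged sketch of the load step (the public table id is an assumption; this part of the diff does not show the source cell):

```python
import bigframes.pandas as bpd

# Assumed source: the BigQuery public penguins table.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
df.head(25)
```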
" ] }, "metadata": {}, @@ -527,294 +477,294 @@ " \n", " \n", " 0\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", - " \n", - " \n", - " 1\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", " MALE\n", " \n", " \n", - " 2\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", + " 1\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 4\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", - " \n", - " \n", - " 5\n", - " Dream\n", + " Biscoe\n", " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", - " MALE\n", - " \n", - " \n", - " 9\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", " MALE\n", " \n", " \n", - " 10\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " 6\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 11\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", - " \n", - " \n", - " 12\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", - " 14\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " 13\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", - " 15\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", + " 14\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", - " 18\n", - " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 16\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 19\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", - " 22\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " 21\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", - " 26\n", + " 30\n", " Dream\n", - " 41.5\n", - " 18.5\n", - " 201.0\n", - " 4000.0\n", + " 39.2\n", + " 21.1\n", + " 196.0\n", + " 4150.0\n", " MALE\n", " \n", " \n", - " 31\n", - " Dream\n", - " 33.1\n", - " 16.1\n", - " 178.0\n", - " 2900.0\n", - " FEMALE\n", + " 32\n", + " Torgersen\n", + " 42.9\n", + " 17.6\n", + " 196.0\n", + " 4700.0\n", + " MALE\n", " \n", " \n", - " 32\n", + " 38\n", " Dream\n", - " 37.2\n", - " 18.1\n", - " 178.0\n", + " 41.1\n", + " 17.5\n", + " 190.0\n", " 3900.0\n", " MALE\n", " \n", " \n", - " 33\n", - " Dream\n", - " 39.5\n", - " 16.7\n", - " 178.0\n", - " 3250.0\n", + " 40\n", + " Torgersen\n", + " 38.6\n", + " 21.2\n", + " 191.0\n", + " 3800.0\n", + " MALE\n", + " \n", + " \n", + " 42\n", + " Biscoe\n", + " 35.5\n", + " 16.2\n", + " 195.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", - " 35\n", + " 44\n", " Dream\n", - " 36.0\n", - " 18.5\n", + " 39.2\n", + " 18.6\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", + " \n", + " \n", + " 45\n", + " Torgersen\n", + " 35.2\n", + " 15.9\n", " 186.0\n", - " 3100.0\n", + " 3050.0\n", " FEMALE\n", " \n", " \n", - " 36\n", + " 46\n", " Dream\n", + " 
43.2\n", + " 18.5\n", + " 192.0\n", + " 4100.0\n", + " MALE\n", + " \n", + " \n", + " 49\n", + " Biscoe\n", " 39.6\n", - " 18.1\n", + " 17.7\n", " 186.0\n", - " 4450.0\n", - " MALE\n", + " 3500.0\n", + " FEMALE\n", " \n", " \n", - " 38\n", - " Dream\n", - " 41.3\n", + " 53\n", + " Biscoe\n", + " 45.6\n", " 20.3\n", - " 194.0\n", - " 3550.0\n", + " 191.0\n", + " 4600.0\n", " MALE\n", " \n", " \n", - " 41\n", - " Dream\n", - " 35.7\n", - " 18.0\n", - " 202.0\n", - " 3550.0\n", + " 58\n", + " Torgersen\n", + " 40.9\n", + " 16.8\n", + " 191.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", - " 51\n", - " Dream\n", - " 38.1\n", - " 17.6\n", - " 187.0\n", - " 3425.0\n", + " 60\n", + " Torgersen\n", + " 40.3\n", + " 18.0\n", + " 195.0\n", + " 3250.0\n", " FEMALE\n", " \n", " \n", - " 53\n", + " 62\n", " Dream\n", " 36.0\n", - " 17.1\n", - " 187.0\n", - " 3700.0\n", + " 18.5\n", + " 186.0\n", + " 3100.0\n", " FEMALE\n", " \n", + " \n", + " 63\n", + " Torgersen\n", + " 39.3\n", + " 20.6\n", + " 190.0\n", + " 3650.0\n", + " MALE\n", + " \n", " \n", "\n", "

"<p>25 rows × 6 columns</p>\n",

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "0 Dream 36.6 18.4 184.0 3475.0 \n", - "1 Dream 39.8 19.1 184.0 4650.0 \n", - "2 Dream 40.9 18.9 184.0 3900.0 \n", - "4 Dream 37.3 16.8 192.0 3000.0 \n", - "5 Dream 43.2 18.5 192.0 4100.0 \n", - "9 Dream 40.2 20.1 200.0 3975.0 \n", - "10 Dream 40.8 18.9 208.0 4300.0 \n", - "11 Dream 39.0 18.7 185.0 3650.0 \n", - "12 Dream 37.0 16.9 185.0 3000.0 \n", - "14 Dream 34.0 17.1 185.0 3400.0 \n", - "15 Dream 37.0 16.5 185.0 3400.0 \n", - "18 Dream 39.7 17.9 193.0 4250.0 \n", - "19 Dream 37.8 18.1 193.0 3750.0 \n", - "22 Dream 40.2 17.1 193.0 3400.0 \n", - "23 Dream 36.8 18.5 193.0 3500.0 \n", - "26 Dream 41.5 18.5 201.0 4000.0 \n", - "31 Dream 33.1 16.1 178.0 2900.0 \n", - "32 Dream 37.2 18.1 178.0 3900.0 \n", - "33 Dream 39.5 16.7 178.0 3250.0 \n", - "35 Dream 36.0 18.5 186.0 3100.0 \n", - "36 Dream 39.6 18.1 186.0 4450.0 \n", - "38 Dream 41.3 20.3 194.0 3550.0 \n", - "41 Dream 35.7 18.0 202.0 3550.0 \n", - "51 Dream 38.1 17.6 187.0 3425.0 \n", - "53 Dream 36.0 17.1 187.0 3700.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "0 Biscoe 40.1 18.9 188.0 \n", + "1 Torgersen 39.1 18.7 181.0 \n", + "4 Biscoe 43.2 19.0 197.0 \n", + "6 Biscoe 41.3 21.1 195.0 \n", + "11 Dream 38.1 18.6 190.0 \n", + "13 Biscoe 37.8 20.0 190.0 \n", + "14 Biscoe 35.0 17.9 190.0 \n", + "16 Torgersen 34.6 21.1 198.0 \n", + "19 Dream 37.2 18.1 178.0 \n", + "21 Biscoe 40.5 17.9 187.0 \n", + "23 Dream 42.2 18.5 180.0 \n", + "30 Dream 39.2 21.1 196.0 \n", + "32 Torgersen 42.9 17.6 196.0 \n", + "38 Dream 41.1 17.5 190.0 \n", + "40 Torgersen 38.6 21.2 191.0 \n", + "42 Biscoe 35.5 16.2 195.0 \n", + "44 Dream 39.2 18.6 190.0 \n", + "45 Torgersen 35.2 15.9 186.0 \n", + "46 Dream 43.2 18.5 192.0 \n", + "49 Biscoe 39.6 17.7 186.0 \n", + "53 Biscoe 45.6 20.3 191.0 \n", + "58 Torgersen 40.9 16.8 191.0 \n", + "60 Torgersen 40.3 18.0 195.0 \n", + "62 Dream 36.0 18.5 186.0 \n", + "63 Torgersen 39.3 20.6 190.0 \n", "\n", - " sex \n", - "0 FEMALE \n", - "1 MALE \n", - "2 MALE \n", - "4 FEMALE \n", - "5 MALE \n", - "9 MALE \n", - "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "14 FEMALE \n", - "15 FEMALE \n", - "18 MALE \n", - "19 MALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "26 MALE \n", - "31 FEMALE \n", - "32 MALE \n", - "33 FEMALE \n", - "35 FEMALE \n", - "36 MALE \n", - "38 MALE \n", - "41 FEMALE \n", - "51 FEMALE \n", - "53 FEMALE \n", + " body_mass_g sex \n", + "0 4300.0 MALE \n", + "1 3750.0 MALE \n", + "4 4775.0 MALE \n", + "6 4400.0 MALE \n", + "11 3700.0 FEMALE \n", + "13 4250.0 MALE \n", + "14 3450.0 FEMALE \n", + "16 4400.0 MALE \n", + "19 3900.0 MALE \n", + "21 3200.0 FEMALE \n", + "23 3550.0 FEMALE \n", + "30 4150.0 MALE \n", + "32 4700.0 MALE \n", + "38 3900.0 MALE \n", + "40 3800.0 MALE \n", + "42 3350.0 FEMALE \n", + "44 4250.0 MALE \n", + "45 3050.0 FEMALE \n", + "46 4100.0 MALE \n", + "49 3500.0 FEMALE \n", + "53 4600.0 MALE \n", + "58 3700.0 FEMALE \n", + "60 3250.0 FEMALE \n", + "62 3100.0 FEMALE \n", + "63 3650.0 MALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -843,18 +793,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "81f9aa34c7234bd88b6b7a4bc77d4b4e", - "version_major": 2, - "version_minor": 0 - }, + 
"text/html": [ + "Query job 0808457b-a0df-4a37-b7a5-8885f4a4588c is DONE. 28.9 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job 288f0daa-a51e-45b4-86bf-d054467c4a99 is DONE. 28.9 kB processed. " ] }, "metadata": {}, @@ -881,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -897,7 +845,7 @@ " ('linreg', LinearRegression(fit_intercept=False))])" ] }, - "execution_count": 15, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -936,9 +884,63 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job e9bfa6a5-a53f-4d8b-ae8c-cc8cd55d0947 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d8d553cf-3d36-49aa-b18b-9a05576a1fb0 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 75ef0083-9a4f-4ffb-a6c6-d82974a1659f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Pipeline(steps=[('preproc',\n", + " ColumnTransformer(transformers=[('onehot', OneHotEncoder(),\n", + " ['island', 'species', 'sex']),\n", + " ('scaler', StandardScaler(),\n", + " ['culmen_depth_mm',\n", + " 'culmen_length_mm',\n", + " 'flipper_length_mm'])])),\n", + " ('linreg', LinearRegression(fit_intercept=False))])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -953,18 +955,16 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fcf406d36c0d4915b318cd30c0f3df25", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 55c5a9ce-8159-4a1a-99a4-af3a906640ba is DONE. 29.3 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 81196f97-304b-4d77-bb0f-8fc8adb8fe75 is RUNNING. " ] }, "metadata": {}, @@ -972,13 +972,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "41399a6b1d4f45328bacc6c868cefdf6", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 3e41c470-de70-4f13-89d9-c5564d0b2836 is DONE. 232 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job b417f27a-387d-4eb2-8d6d-287327ef0471 is DONE. 232 Bytes processed. " ] }, "metadata": {}, @@ -986,13 +984,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e3c17676eab448c0942c0c32689ba4b5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ed2f9042-a737-4d13-bd21-8c3d29cd61a2 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job b7f89a61-d76a-47be-8b83-917d69f255a2 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -1000,13 +996,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6c903861564b412aad9d9decad26560c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 815d16b5-0a5d-42be-a766-1cff5b8f22f2 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 9619c393-90b3-4fea-a197-d09389e9486c is DONE. 31.7 kB processed. 
" ] }, "metadata": {}, @@ -1014,13 +1008,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2c2534cd90e64c81be45753b81b1be46", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 37a38dc6-5073-4544-a1e3-da145a843922 is DONE. 29.4 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e5854451-ffb4-4a28-a25f-3bdd68e9edae is DONE. 32.2 kB processed. " ] }, "metadata": {}, @@ -1029,10 +1021,10 @@ { "data": { "text/plain": [ - "0.6757452736197735" + "0.2655729213572775" ] }, - "execution_count": 17, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1040,9 +1032,9 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "pred_y = pipeline.predict(X_test)\n", + "y_pred = pipeline.predict(X_test)[\"predicted_body_mass_g\"]\n", "\n", - "r2_score(y_test, pred_y)" + "r2_score(y_test, y_pred)" ] }, { @@ -1055,18 +1047,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9295d6a3ff834f7a91a43d3f4ef4a61c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Load job 7b46750c-70b4-468d-87ba-9f84f579f2a6 is DONE. Open Job" + ], "text/plain": [ - "HTML(value='Load job d4c2f933-3514-4901-bcd7-888ee66eba82 is RUNNING. " ] }, "metadata": {}, @@ -1097,32 +1087,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b7eb82b3b5fc4a8e97468070a3e76300", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job e4ffd919-6f69-4382-a7e5-db37c7c1fefa is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6b3e3285-79e9-4137-bf3b-7b7185ef76a5 is DONE. 24 Bytes processed. " ] }, "metadata": {}, @@ -1130,13 +1104,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "edc7bc6434bd4be4926626a235aab65a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 207cb787-cf8a-43ea-8e73-644d3f58b11a is DONE. 24 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 173c4194-e194-43d2-8359-7bec83d3c861 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1144,13 +1116,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "55a8cbd9b1ab47eeab6e1c305847630f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job c5dc5075-cac0-4947-9e9f-06aa9cc5bd2a is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 53ba2332-590c-488d-9505-23aebaaad9cb is DONE. 48 Bytes processed. " ] }, "metadata": {}, @@ -1158,13 +1128,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "463a5b072148474db629b9346fa3a6d1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2ca4a569-7186-48ed-b3e4-004dca704798 is DONE. 282 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 66e4a8e0-4cae-4e9d-86e0-17dc24f6cfbb is DONE. 0 Bytes processed. 
" ] }, "metadata": {}, @@ -1192,41 +1160,83 @@ " \n", " \n", " predicted_body_mass_g\n", + " species\n", + " island\n", + " culmen_length_mm\n", + " culmen_depth_mm\n", + " flipper_length_mm\n", + " sex\n", " \n", " \n", " tag_number\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 1633\n", - " 3965.994361\n", + " 4017.203152\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 39.5\n", + " 18.8\n", + " 196.0\n", + " MALE\n", " \n", " \n", " 1672\n", - " 3246.312058\n", + " 3127.601519\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 38.5\n", + " 17.2\n", + " 181.0\n", + " FEMALE\n", " \n", " \n", " 1690\n", - " 3456.404062\n", + " 3386.101231\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.9\n", + " 18.1\n", + " 188.0\n", + " FEMALE\n", " \n", " \n", "\n", - "

- "<p>3 rows × 1 columns</p>\n",

\n", - "[3 rows x 1 columns in total]" + "

+ "<p>3 rows × 7 columns</p>\n",

\n", + "[3 rows x 7 columns in total]" ], "text/plain": [ - " predicted_body_mass_g\n", - "tag_number \n", - "1633 3965.994361\n", - "1672 3246.312058\n", - "1690 3456.404062\n", + " predicted_body_mass_g species \\\n", + "tag_number \n", + "1633 4017.203152 Adelie Penguin (Pygoscelis adeliae) \n", + "1672 3127.601519 Adelie Penguin (Pygoscelis adeliae) \n", + "1690 3386.101231 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "tag_number \n", + "1633 Torgersen 39.5 18.8 196.0 \n", + "1672 Torgersen 38.5 17.2 181.0 \n", + "1690 Dream 37.9 18.1 188.0 \n", "\n", - "[3 rows x 1 columns]" + " sex \n", + "tag_number \n", + "1633 MALE \n", + "1672 FEMALE \n", + "1690 FEMALE \n", + "\n", + "[3 rows x 7 columns]" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1240,28 +1250,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Save in BigQuery" + "## 6. Save in BigQuery" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "Copy job d1def4a4-1da1-43a9-8ae5-4459444d993d is DONE.
Open Job" + ], "text/plain": [ - "Pipeline(steps=[('preproc',\n", - " ColumnTransformer(transformers=[('onehot', OneHotEncoder(),\n", - " ['island', 'species', 'sex']),\n", - " ('scaler', StandardScaler(),\n", - " ['culmen_depth_mm',\n", - " 'culmen_length_mm',\n", - " 'flipper_length_mm'])])),\n", - " ('linreg', LinearRegression(fit_intercept=False))])" + "" ] }, - "execution_count": 20, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Pipeline(steps=[('transform',\n", + " ColumnTransformer(transformers=[('ont_hot_encoder',\n", + " OneHotEncoder(max_categories=1000001,\n", + " min_frequency=0),\n", + " 'island'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'culmen_length_mm'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'culmen_depth_mm'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'flipper_length_mm'),\n", + " ('ont_hot_encoder',\n", + " OneHotEncoder(max_categories=1000001,\n", + " min_frequency=0),\n", + " 'sex')])),\n", + " ('estimator',\n", + " LinearRegression(fit_intercept=False,\n", + " optimize_strategy='NORMAL_EQUATION'))])" + ] + }, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1269,6 +1304,13 @@ "source": [ "pipeline.to_gbq(\"bigframes-dev.bigframes_demo_us.penguin_model\", replace=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1287,7 +1329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.9" }, "orig_nbformat": 4, "vscode": { diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index f01116665f..9244c4b9f1 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -98,7 +98,9 @@ def test_cluster_configure_fit_score_predict( score_result, score_expected, check_exact=False, rtol=0.1 ) - result = model.predict(new_penguins).to_pandas() + predictions = model.predict(new_penguins).to_pandas() + assert predictions.shape == (4, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index a8613dfeb9..b98d7a757c 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -179,7 +179,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) +# @pytest.mark.flaky(retries=2, delay=120) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 3e56954058..2929baf3f7 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -545,7 +545,9 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( score_result, score_expected, check_exact=False, rtol=0.1 ) - result = pl.predict(new_penguins).to_pandas().sort_index() + predictions = pl.predict(new_penguins).to_pandas().sort_index() + assert predictions.shape == (6, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [1, 2, 1, 2, 1, 2]}, dtype="Int64", diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 266a38e3ee..a9fec0bbce 100644 --- 
a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -62,7 +62,9 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): new_penguins = session.read_pandas(_PD_NEW_PENGUINS) - result = penguins_kmeans_model.predict(new_penguins).to_pandas() + predictions = penguins_kmeans_model.predict(new_penguins).to_pandas() + assert predictions.shape == (4, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", diff --git a/tests/system/small/ml/test_ensemble.py b/tests/system/small/ml/test_ensemble.py index bba083d98d..55d9fef661 100644 --- a/tests/system/small/ml/test_ensemble.py +++ b/tests/system/small/ml/test_ensemble.py @@ -98,7 +98,9 @@ def test_xgbregressor_model_score_series( def test_xgbregressor_model_predict( penguins_xgbregressor_model: bigframes.ml.ensemble.XGBRegressor, new_penguins_df ): - result = penguins_xgbregressor_model.predict(new_penguins_df).to_pandas() + predictions = penguins_xgbregressor_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": ["4293.1538089", "3410.0271", "3357.944"]}, dtype="Float64", @@ -220,7 +222,9 @@ def test_xgbclassifier_model_score_series( def test_xgbclassifier_model_predict( penguins_xgbclassifier_model: bigframes.ml.ensemble.XGBClassifier, new_penguins_df ): - result = penguins_xgbclassifier_model.predict(new_penguins_df).to_pandas() + predictions = penguins_xgbclassifier_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 9) + result = predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", @@ -363,7 +367,11 @@ def test_randomforestregressor_model_predict( penguins_randomforest_regressor_model: bigframes.ml.ensemble.RandomForestRegressor, new_penguins_df, ): - result = penguins_randomforest_regressor_model.predict(new_penguins_df).to_pandas() + predictions = penguins_randomforest_regressor_model.predict( + new_penguins_df + ).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": ["3897.341797", "3458.385742", "3458.385742"]}, dtype="Float64", @@ -490,7 +498,11 @@ def test_randomforestclassifier_model_predict( penguins_randomforest_classifier_model: bigframes.ml.ensemble.RandomForestClassifier, new_penguins_df, ): - result = penguins_randomforest_classifier_model.predict(new_penguins_df).to_pandas() + predictions = penguins_randomforest_classifier_model.predict( + new_penguins_df + ).to_pandas() + assert predictions.shape == (3, 9) + result = predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 55079c94cf..948db59650 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -22,6 +22,8 @@ def test_model_predict(time_series_arima_plus_model): utc = pytz.utc predictions = time_series_arima_plus_model.predict().to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["forecast_timestamp", "forecast_value"]] expected = pd.DataFrame( { "forecast_timestamp": [ @@ -38,7 +40,7 @@ def test_model_predict(time_series_arima_plus_model): ) pd.testing.assert_frame_equal( - 
predictions, + result, expected, rtol=0.1, check_index_type=False, diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index d305567066..9008e85a0b 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -32,7 +32,9 @@ def test_tensorflow_create_model_default_session(imported_tensorflow_model_path) def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): df = llm_text_df.rename(columns={"prompt": "input"}) - result = imported_tensorflow_model.predict(df).to_pandas() + predictions = imported_tensorflow_model.predict(df).to_pandas() + assert predictions.shape == (3, 2) + result = predictions[["dense_1"]] # The values are non-human-readable. As they are a dense layer of Neural Network. # And since it is pretrained and imported, the model is a opaque-box. # We may want to switch to better test model and cases. @@ -72,7 +74,9 @@ def test_onnx_create_model_default_session(imported_onnx_model_path): def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): - result = imported_onnx_model.predict(onnx_iris_df).to_pandas() + predictions = imported_onnx_model.predict(onnx_iris_df).to_pandas() + assert predictions.shape == (3, 7) + result = predictions[["label", "probabilities"]] value1 = np.array([0.9999993443489075, 0.0, 0.0]) value2 = np.array([0.0, 0.0, 0.9999993443489075]) expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 3a8232ed9e..218c1074ab 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -91,13 +91,15 @@ def test_linear_reg_model_score_series( def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pandas.testing.assert_frame_equal( - predictions.sort_index(), + result.sort_index(), expected, check_exact=False, rtol=0.1, @@ -224,13 +226,15 @@ def test_logistic_model_score_series( def test_logsitic_model_predict(penguins_logistic_model, new_penguins_df): predictions = penguins_logistic_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 9) + result = predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pandas.testing.assert_frame_equal( - predictions.sort_index(), + result.sort_index(), expected, check_exact=False, rtol=0.1, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 79d3c40317..306098548e 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest import TestCase - import numpy as np import pytest @@ -48,7 +46,7 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan llm_text_df = bpd.read_pandas(llm_text_pandas_df) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -72,7 +70,7 @@ def test_create_text_generator_32k_model_default_session( llm_text_df = bpd.read_pandas(llm_text_pandas_df) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -97,7 +95,7 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): ) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -109,7 +107,7 @@ def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -120,7 +118,7 @@ def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): df = palm2_text_generator_model.predict(llm_text_df["prompt"]).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -132,7 +130,7 @@ def test_text_generator_predict_arbitrary_col_label_success( ): llm_text_df = llm_text_df.rename(columns={"prompt": "arbitrary"}) df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -145,7 +143,7 @@ def test_text_generator_predict_with_params_success( df = palm2_text_generator_model.predict( llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 ).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -196,7 +194,7 @@ def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): df = palm2_embedding_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] @@ -209,7 +207,7 @@ def test_embedding_generator_multilingual_predict_success( palm2_embedding_generator_multilingual_model, llm_text_df ): df = palm2_embedding_generator_multilingual_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = 
series[0] @@ -222,7 +220,7 @@ def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): df = palm2_embedding_generator_model.predict(llm_text_df["prompt"]).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 5369d3662d..be6c5e7c52 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -20,19 +20,7 @@ class _BaseKMeans(BaseEstimator, ABC): """Base class for KMeans and MiniBatchKMeans""" - def predict(self, X): - """Predict the closest cluster each sample in X belongs to. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Series or DataFrame of shape (n_samples, n_features). The data matrix for - which we want to get the predictions. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the - class labels for each sample. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + pass class KMeans(_BaseKMeans): @@ -73,7 +61,7 @@ def predict( DataFrame of shape (n_samples, n_features). New data to predict. Returns: - bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 8dc3b6280a..ab946e5861 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -16,7 +16,6 @@ # Original location: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/linear_model/_base.py from abc import ABCMeta -from typing import List, Optional from bigframes import constants from third_party.bigframes_vendored.sklearn.base import ( @@ -35,7 +34,7 @@ def predict(self, X): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -50,8 +49,7 @@ def predict(self, X): which we want to get the predictions. Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing - the class labels for each sample. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index b7b43b85a3..dfd0ba7356 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -18,7 +18,7 @@ def predict(self, X): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - DataFrame of shape (n_samples,): Returns predicted values. 
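The vendored docstring edits above, together with the test changes earlier in this patch, codify one convention: `predict()` now returns a DataFrame of shape (n_samples, n_input_columns + n_prediction_columns) rather than the prediction column alone. Callers therefore select the column they need explicitly; a short sketch, assuming any fitted bigframes regressor `model` and an input frame `new_penguins_df`:

```python
predictions = model.predict(new_penguins_df).to_pandas()
# Input columns now travel alongside the predictions; keep only the new one.
result = predictions[["predicted_body_mass_g"]]
```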
+ bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From a7298317ea2604faa6ae31817f1f729d7e0b9818 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:44:14 -0800 Subject: [PATCH 03/26] fix: invalid JSON type of the notebook (#215) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .../bq_dataframes_llm_kmeans.ipynb | 1064 +---------------- 1 file changed, 33 insertions(+), 1031 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index ae03813639..8d75950925 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -139,17 +139,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Updated property [core/project].\n" - ] - } - ], + "outputs": [], "source": [ "# set your project ID below\n", "PROJECT_ID = \"\" # @param {type:\"string\"}\n", @@ -170,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -264,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "R7STCS8xB5d2" }, @@ -296,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "zDSwoBo1CU3G" }, @@ -307,101 +299,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "id": "tYDoaKgJChiq" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9f096761-e3b5-4d58-a9f7-485ced67afca is DONE. 2.3 GB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ee8fecb1-2e30-407d-9e2e-9e76061da9e7 is DONE. 2.3 GB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
consumer_complaint_narrative
0I signed a contract as a condition of employme...
1First, I want to disclose that XXXX and XXXX b...
2Frequent calls from Focused Receivables Manage...
3I recently contacted Enhanced Recovery Company...
4This began when I subscribed to XXXX XXXX inte...
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " consumer_complaint_narrative\n", - "0 I signed a contract as a condition of employme...\n", - "1 First, I want to disclose that XXXX and XXXX b...\n", - "2 Frequent calls from Focused Receivables Manage...\n", - "3 I recently contacted Enhanced Recovery Company...\n", - "4 This began when I subscribed to XXXX XXXX inte...\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "issues_df = input_df[[\"consumer_complaint_narrative\"]].dropna()\n", "issues_df.head(n=5) # View the first five complaints" @@ -417,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "OltYSUEcsSOW" }, @@ -439,24 +341,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "id": "li38q8FzDDMu" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 52d2e961-7896-497c-8b03-ab7374737679 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from bigframes.ml.llm import PaLM2TextEmbeddingGenerator\n", "\n", @@ -465,125 +354,11 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "id": "cOuSOQ5FDewD" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d093d51a-8eda-442f-80cd-568cb76e00b3 is DONE. 10.6 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 6419df65-3e96-41a7-a7b5-3d058e18763a is DONE. 80.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 917f09ea-c468-4363-a856-b1091e5f775f is DONE. 80.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5c9679e7-192c-40b5-a14b-edc0fa113eaa is DONE. 61.5 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
text_embedding
422[-0.012013785541057587, 0.003669967409223318, ...
616[-0.014948881231248379, -0.04672442376613617, ...
833[-0.01951478235423565, -0.027120858430862427, ...
1370[-0.03140445053577423, -0.048797041177749634, ...
1430[-0.02244548313319683, -0.03336532413959503, 0...
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " text_embedding\n", - "422 [-0.012013785541057587, 0.003669967409223318, ...\n", - "616 [-0.014948881231248379, -0.04672442376613617, ...\n", - "833 [-0.01951478235423565, -0.027120858430862427, ...\n", - "1370 [-0.03140445053577423, -0.048797041177749634, ...\n", - "1430 [-0.02244548313319683, -0.03336532413959503, 0...\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Will take ~3 minutes to compute the embeddings\n", "predicted_embeddings = model.predict(downsampled_issues_df)\n", @@ -593,263 +368,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "id": "4H_etYfsEOFP" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ce9cb0f9-4b0d-40a1-81f3-d6e60dd6c684 is DONE. 160.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job aa692a30-5706-46ad-8029-faf2fac66234 is DONE. 72.2 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
consumer_complaint_narrativetext_embedding
2580664Hello, my name is XXXX XXXX, and I am writing ...[0.0003211698785889894, -0.01816680282354355, ...
1806973This is XXXX XXXX and I am submitting this com...[-0.009485247544944286, -0.025846892967820168,...
2055053XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX...[-0.010950954630970955, -0.0249345600605011, 0...
2515231When I reinvestigated my credit report, I real...[-0.009660656563937664, -0.05793113633990288, ...
2633049Checking my credit report XX/XX/2018 with all ...[-0.0022159104701131582, -0.03330004960298538,...
3117273I contacted TransUnion and spoke a credit rep ...[-0.015955328941345215, -0.006488671060651541,...
698814XXXX XXXX XXXX. makes daily calls to me cell c...[0.005397460889071226, -0.01276913657784462, 0...
267826Can we please reopen Case : XXXX? \n", - "\n", - "Wells Farg...[0.004065403249114752, -0.0005381882656365633,...
54019My rights under 15 USC 1681 have been violated...[0.013823015615344048, -0.02010691538453102, 0...
141050To whom it may concern : My personal informati...[0.008104532025754452, -0.01856449618935585, 0...
2962076I have had a CashApp account since last year, ...[-0.0003019514260813594, -0.03750108182430267,...
2481105that some of the information was erroneous. Th...[-0.014868081547319889, -0.0443895161151886, -...
431562I have disputed the referenced accounts to the...[-0.0020524838473647833, -0.04830990731716156,...
1953029On, XX/XX/22, I attempted to complete a transa...[-0.01599179394543171, -0.0074900356121361256,...
2395979Subject : XXXX XXXX XXXX compensation, refund,...[-0.0035950862802565098, -0.014652969315648079...
455524I paid off my mortgage on XX/XX/2019. The comp...[-0.01100730150938034, -0.03495829552412033, 0...
2155924This kind of account is placed as a charged of...[-0.028635455295443535, -0.028604287654161453,...
1069497This is one of many issues I have had with Wel...[0.008871790021657944, -0.028502725064754486, ...
3181689I have disputed this account with MONTEREY FIN...[-0.004721717908978462, -0.03673810139298439, ...
274268Lender is not updating my loan status in the V...[-0.009221495129168034, -0.0289347805082798, 0...
1671305XXXX is a peer to peer lending conmpany that u...[-0.02911308966577053, -0.01850792020559311, -...
886026( DISPUTE CODE - XXXX ) My personal informatio...[-0.007220877334475517, -0.016615957021713257,...
1044431I filed a complaint against PNC this year and ...[0.002848619595170021, -0.035117778927087784, ...
1938481I applied for a modification and was approved....[-0.03114932030439377, -0.0421406552195549, 0....
1987834Ive been Disputting my XXXX XXXX I opened this...[-0.009406660683453083, -0.020967338234186172,...
\n", - "

25 rows × 2 columns

\n", - "
[10000 rows x 2 columns in total]" - ], - "text/plain": [ - " consumer_complaint_narrative \\\n", - "2580664 Hello, my name is XXXX XXXX, and I am writing ... \n", - "1806973 This is XXXX XXXX and I am submitting this com... \n", - "2055053 XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... \n", - "2515231 When I reinvestigated my credit report, I real... \n", - "2633049 Checking my credit report XX/XX/2018 with all ... \n", - "3117273 I contacted TransUnion and spoke a credit rep ... \n", - "698814 XXXX XXXX XXXX. makes daily calls to me cell c... \n", - "267826 Can we please reopen Case : XXXX? \n", - "\n", - "Wells Farg... \n", - "54019 My rights under 15 USC 1681 have been violated... \n", - "141050 To whom it may concern : My personal informati... \n", - "2962076 I have had a CashApp account since last year, ... \n", - "2481105 that some of the information was erroneous. Th... \n", - "431562 I have disputed the referenced accounts to the... \n", - "1953029 On, XX/XX/22, I attempted to complete a transa... \n", - "2395979 Subject : XXXX XXXX XXXX compensation, refund,... \n", - "455524 I paid off my mortgage on XX/XX/2019. The comp... \n", - "2155924 This kind of account is placed as a charged of... \n", - "1069497 This is one of many issues I have had with Wel... \n", - "3181689 I have disputed this account with MONTEREY FIN... \n", - "274268 Lender is not updating my loan status in the V... \n", - "1671305 XXXX is a peer to peer lending conmpany that u... \n", - "886026 ( DISPUTE CODE - XXXX ) My personal informatio... \n", - "1044431 I filed a complaint against PNC this year and ... \n", - "1938481 I applied for a modification and was approved.... \n", - "1987834 Ive been Disputting my XXXX XXXX I opened this... \n", - "\n", - " text_embedding \n", - "2580664 [0.0003211698785889894, -0.01816680282354355, ... \n", - "1806973 [-0.009485247544944286, -0.025846892967820168,... \n", - "2055053 [-0.010950954630970955, -0.0249345600605011, 0... \n", - "2515231 [-0.009660656563937664, -0.05793113633990288, ... \n", - "2633049 [-0.0022159104701131582, -0.03330004960298538,... \n", - "3117273 [-0.015955328941345215, -0.006488671060651541,... \n", - "698814 [0.005397460889071226, -0.01276913657784462, 0... \n", - "267826 [0.004065403249114752, -0.0005381882656365633,... \n", - "54019 [0.013823015615344048, -0.02010691538453102, 0... \n", - "141050 [0.008104532025754452, -0.01856449618935585, 0... \n", - "2962076 [-0.0003019514260813594, -0.03750108182430267,... \n", - "2481105 [-0.014868081547319889, -0.0443895161151886, -... \n", - "431562 [-0.0020524838473647833, -0.04830990731716156,... \n", - "1953029 [-0.01599179394543171, -0.0074900356121361256,... \n", - "2395979 [-0.0035950862802565098, -0.014652969315648079... \n", - "455524 [-0.01100730150938034, -0.03495829552412033, 0... \n", - "2155924 [-0.028635455295443535, -0.028604287654161453,... \n", - "1069497 [0.008871790021657944, -0.028502725064754486, ... \n", - "3181689 [-0.004721717908978462, -0.03673810139298439, ... \n", - "274268 [-0.009221495129168034, -0.0289347805082798, 0... \n", - "1671305 [-0.02911308966577053, -0.01850792020559311, -... \n", - "886026 [-0.007220877334475517, -0.016615957021713257,... \n", - "1044431 [0.002848619595170021, -0.035117778927087784, ... \n", - "1938481 [-0.03114932030439377, -0.0421406552195549, 0.... \n", - "1987834 [-0.009406660683453083, -0.020967338234186172,... 
\n", - "...\n", - "\n", - "[10000 rows x 2 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Join the complaints with their embeddings in the same DataFrame\n", - "combined_df = downsampled_issues_df.join(predicted_embeddings, how=\"left\")\n", - "combined_df" + "combined_df = downsampled_issues_df.join(predicted_embeddings)" ] }, { @@ -872,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": { "id": "AhNTnEC5FRz2" }, @@ -893,152 +419,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": { "id": "6poSxh-fGJF7" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 65eb317d-59f1-4d10-acd1-4b7f3778114c is DONE. 61.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 156e445e-cc01-4b30-84cc-ac1c98a69b81 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5befc212-f4a3-4e33-b1b2-01e809acdcbd is DONE. 61.9 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job bd271178-8b8d-45dc-ac57-7f0194d0daac is DONE. 80.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job bbfb9cca-622d-4bf5-9fc0-6d9a85287d41 is DONE. 80.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job a5f30b32-9fb0-42b4-b426-d8484f008bdb is DONE. 160.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CENTROID_ID
4222
6163
8335
13707
14303
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " CENTROID_ID\n", - "422 2\n", - "616 3\n", - "833 5\n", - "1370 7\n", - "1430 3\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(combined_df[\"text_embedding\"])\n", + "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", @@ -1047,123 +435,13 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 7a41196e-ea67-44ac-95a7-7dce620d6d21 is DONE. 320.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8008b482-1a0d-461f-a215-4676d9d918dc is DONE. 72.4 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
consumer_complaint_narrativetext_embeddingCENTROID_ID
2580664Hello, my name is XXXX XXXX, and I am writing ...[0.0003211698785889894, -0.01816680282354355, ...2
1806973This is XXXX XXXX and I am submitting this com...[-0.009485247544944286, -0.025846892967820168,...5
2055053XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX...[-0.010950954630970955, -0.0249345600605011, 0...3
2515231When I reinvestigated my credit report, I real...[-0.009660656563937664, -0.05793113633990288, ...5
2633049Checking my credit report XX/XX/2018 with all ...[-0.0022159104701131582, -0.03330004960298538,...3
\n", - "

5 rows × 3 columns

\n", - "
[5 rows x 3 columns in total]" - ], - "text/plain": [ - " consumer_complaint_narrative \\\n", - "2580664 Hello, my name is XXXX XXXX, and I am writing ... \n", - "1806973 This is XXXX XXXX and I am submitting this com... \n", - "2055053 XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... \n", - "2515231 When I reinvestigated my credit report, I real... \n", - "2633049 Checking my credit report XX/XX/2018 with all ... \n", - "\n", - " text_embedding CENTROID_ID \n", - "2580664 [0.0003211698785889894, -0.01816680282354355, ... 2 \n", - "1806973 [-0.009485247544944286, -0.025846892967820168,... 5 \n", - "2055053 [-0.010950954630970955, -0.0249345600605011, 0... 3 \n", - "2515231 [-0.009660656563937664, -0.05793113633990288, ... 5 \n", - "2633049 [-0.0022159104701131582, -0.03330004960298538,... 3 \n", - "\n", - "[5 rows x 3 columns]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Join the group number to the complaints and their text embeddings\n", "combined_clustered_result = combined_df.join(clustered_result)\n", - "\n", - "combined_clustered_result.head(n=5)" + "combined_clustered_result.head(n=5) " ] }, { @@ -1194,36 +472,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": { "id": "2E7wXM_jGqo6" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 50c7c0dd-94a2-494e-a37f-6a838a518f6c is DONE. 11.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d96c847f-c292-4804-bd05-fd643c41c7a5 is DONE. 11.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Using bigframes, with syntax identical to pandas,\n", "# filter out the first and second groups\n", @@ -1240,100 +493,11 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": { "id": "ZNDiueI9IP5e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "comment list 1:\n", - "1. XXXX is a peer to peer lending conmpany that uses borrowers crypto to collateralize loans from investors ( like myself ). I've been investing with them for almost XXXX years and currently have {$240000.00} tied up in lending products with XXXX. \n", - "As of XXXX days ago we received an email saying all business operations have been ceased and no withdrawals or deposits will be allowed. They said they'll update customers within 10 days, but no one can reach anyone at the company to find out any more details as they are not answering calls nor returning emails. It also appears the company has scrubbed its XXXX page and the XXXX pages of top executives. \n", - "\n", - "All collateral and client 's investment funds are supposedly held at or processed through XXXX XXXX XXXX ( registered SEC company ). XXXX XXXX keeps telling us to contact XXXX and won't give us any information, so we have no way to find out what's happening with our funds/collateral or if everything is gone. We have a XXXX channel up where people are gathering evidence, documentation, etc. This is probably the best place to start to get a broad view of what's happening. Details below. 
\n", - "\n", - "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX CONST LLC ( Business ID : XXXX ) FoXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 'Cease of Operations ' email received by all investors XXXX XX/XX/2022 at XXXX : \" Dear XXXX Users, Given the collapses of several cryptocurrencies so far this year and the rapidly deteriorating market conditions that have been prompting heavy withdrawals across all XXXX lending and XXXX exchange platforms recently, we are sad to inform you that we are unable to continue to operate our business as usual. As such, we are limiting our business activities, including pausing user withdrawals as allowed under our Terms of XXXX. \n", - "No deposit or investment request will be processed at this time. \n", - "\n", - "Our team is working diligently towards our objective of maximizing value for all of our Users, and our top priority continues to be to protect your interests. As we explore all options available to us, we will provide updates to you as we go. \n", - "\n", - "We hope to communicate with you within the next XXXX business days on the next steps to address the situation. We appreciate your patience in this trying time. \n", - "\n", - "Sincerely yoursXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", - "2. Submitted XX/XX/XXXX\n", - "Typed XX/XX/XXXX:\n", - "\n", - "XX/XX/XXXX\n", - "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX XXXX\n", - "PH:. XXXX\n", - "PH: XXXX\n", - "EM:\n", - "XXXX\n", - "XXXX\n", - "XXXX XXXX \n", - "XXXX XXXX\n", - "Date of Birth XX/XX/XXXX\n", - "SS#: XXXX\n", - "TO:\n", - "* Consumer Financial Protection Brueau\n", - "* Department of Veteran Affairs, Office of the Inspector General\n", - "My name is XXXX XXXX XXXX, I've received more than one email from Discover Card in my XXXX XXXX, past emails from Discover Card were unautherized deletions.\n", - "From: Discover Card XXXX\n", - "To: You XXXX\n", - "Date: XX/XX/XXXX, XXXX XXXX XXXX From: Discover Card XXXX>\n", - "To Recipient \n", - "Date Mon, XX/XX/XXXX XXXX XXXX\n", - "I dont and havent ever had a Discover Checking, Savings, Business Accounts nor Loans of any kind through any Bank called Discover. The 1st time I was contacted by Discover Card I resided alone from XX/XX/XXXX to XX/XX/XXXXat XXXX XXXX XXXX at XXXX XXXX XXXX XXXX XXXX in XXXX, XXXX years prior to me moving here to XXXX, XXXX in XX/XX/XXXX. When \n", - "\n", - "\n", - "Discover Card had 1st contacted me in XXXX, XXXX it was associated with my XXXX XXXX XXXX website related online Merchants Account. Not once have I ever applied for or had any Website Merchant Accounts here in XXXX; I only applied for online online Merchant Accounts associated with my XXXX related Accounts I purchased while residing in XXXX, XXXX. Some of my website related information was stolen both in XXXX, XXXX and here in XXXX along with my other property that hasn't been returned to me. 
I don't and haven't ever had any XXXX XXXX related Agreements,Contracts or Credit Cards offered to Veterans associated with ones businesses. Nor have I ever applied for or had a Business License or Business Permit in any City or State inspite of my diverse interest. Not once have I ever allowed another be it an Paralegal, Payee, Attorney, Employers, Landlords, Veteran Organizations including Vocational Rehabilitation Programs, XXXX( XXXX XXXX XXXX, XXXX XXXX, Entertainment Companies, Banks, Celebrity Personal Assistant Agencies or Celebs, Shelters, Charities, HUD, Housing Arthority, Department of Veteran Affairs, Military, Law Enforcement or anyone else nor their employess to sign any business related Agreements or Contracts on my behalf; not even my family members or friends. \n", - "None of my XXXX XXXX attempts were associated with my Employers, Department of Veteran Affairs,Vocational Rehabilitation Programs Military, Landlords, HUD( Housing Authority),Friends, Family nor did I ever sign related Agreements or Contracts with them. Not once had I ever provided anyone the passwords to be able to sign into my accounts rather were aware of my accounts or not. Yes, my desktop computer that was stolen along with my other property XX/XX/XXXX was registered with my Online Merchant Account. I had paid for my Merchant related Accounts through my same XXXX XXXX XXXX Account I purchased both of my XXXX XXXX XXXX related accounts through. That was 1st once during the Summer of XX/XX/XXXX and 2nd my related website months later, while I resided in XXXX XXXX and I worked for XXXX. I never offered nor did I ever sign any business Contracts or Agreements with XXXX nor my Landlord or their staff associted with any of my online websites or Merchant Accounts. My XXXX XXXX XXXX Compensation was deposited into both of my XXXX XXXX XXXX Accounts at that time. My account was changed during the Summer of XX/XX/XXXXbecause of theft of my Bank Card. None of my Checking,Savings, past Credit Cards or Business related were shared accounts in which others were allowed to \n", - "use to make purchases. I had written checks from my XXXX XXXX XXXX account to pay for my XXXX XXXX XXXX XXXX on the XXXX XXXX here in XXXX in XX/XX/XXXX before it's name changed to XXXX XXXX. Prior to me using my same account open a Checking account in person at XXXX XXXX before it's name was changed to XXXX XXXX. Where my XXXX XXXX XXXX XXXX has been deposited since that time. I had used my XXXX XXXX Checking to pay for my XXXX XXXX XXXX XXXX both before theft of my property XX/XX/XXXX and that was also prior to the theft of my property from my XXXX XXXX XXXX XXXX in XX/XX/XXXX.\n", - "I've stated this many times:\n", - "I paid for my 1st XXXX XXXX XXXX Membership while employed at XXXX using my XXXX XXXX XXXX account XXXX my XXXX XXXX XXXX XXXX was also deposited. That was changed to XXXX because I didn't receive my 1st XXXX XXXX XXXX Card the bank sent to XXXX XXXX residence on XXXX XXXX in XX/XX/XXXX while I was there. In which both my XXXX salary and XXXX XXXX XXXX XXXX were deposited into my account, no money from XXXX XXXX nor anyone else that was at that residence was given to nor were any of my children there. Nor did XXXX or any other person at that residence ever give me my missing Bank Card not even after I moved out and stayed a month at XXXX XXXX XXXX using my replacement card to pay for my Hotel room. 
Which is the same account I used to pay for XXXX XXXX Membership, XXXX XXXX XXXX, XXXX XXXX Membership fees, and various online Merchant Account activation related fees.\n", - "* XXXX XXXX XXXX.\n", - "XXXX XXXX XXXX XXXX. Membership\n", - "\n", - "# XXXX\n", - "* XXXX XXXX Membership\n", - "# XXXX\n", - "* Total Merchant Services XXXX and XXXX.\n", - "* XXXX XXXX XXXX XXXX XXXX\n", - "* XXXX XXXX changed my $XXXX a month fees to my XXXX XXXX XXXX account #XXXX.\n", - "XX/XX/XXXX - XX/XX/XXXX XXXX XXXX, XXXX.\n", - "\n", - "Rep: XXXX XXXX XXXX, Fl \n", - "XXXX\n", - "XXXX Website \n", - "XXXX\n", - "Software and website owner, I performed Internet advertising and marketing, to promote this software and website. I worked and XXXX from my home XXXX XXXX XXXX XXXX XXXX , XXXX. I purchased XXXX XXXX XXXX-Software Electronic Book CD and was given a website to promote the software on the internet. The XXXX was given a copy of my website owner certificate document submitted to me when I purchased the software marketing program as well copies of my other school transcripts in addition to XXXX XXXX XXXX for example. XXXX, represented the first initials of my children's names. I wasn't ever paid and I'm still owed the money. Nor did my marketing program have anything to do with any schools, college nor university programs nor did I ever offer or sign any agreement to include it such. Nor did my XXXX XXXX XXXX have anything to do with any other employers, Department of Family and Children, Military, Veteran Organizations or Food Stamp programs, Section 8 nor Indianapolis Housing Authority for example; only me.\n", - "Thank you,\n", - "XXXX XXXX\n", - "3. ACCORDING TO 15 U.S. CODE 6803-DISCLOSURE OF INSTITUTION PRIVACY POLICY, AND ACCORDING TO U.S. CODE 6802- OBLIGATIONS WITH RESPECT TO DISCLOSURES OF PERSONAL INFORMATION. ( b ) OPT OUT ( 1 ) IN GENERAL A FINANCIAL INSTITUTION MAY NOT DISCLOSE NONPUBLIC PERSONAL INFORMATION TO A NONAFFILIATED THIRD PARTY ( TRANSUNION, XXXX, AND XXXX. ) UNLESS- ( A ) SUCH FINANCIAL INSTITUTION CLEARLY AND CONSPICUOUSLY DISCLOSES TO THE CONSUMER, IN WRITING OR IN ELECTRONIC FORM OR OTHER FORM PERMITTED BY THE REGULATIONS PRESCRIBED UNDER SECTION 6804 OF THIS TITLE. ALSO ACCORDING TO THE \" XXXX ACT '', FINANCIAL INSTITUTIONS MUST TELL THEIR CUSTOMERS ABOUT THEIR INFORMATION-SHARING PRACTICES AND EXPLAIN TO CUSTOMERS THEIR RIGHT TO \" OPT OUT '' IF THEY DON'T WANT THEIR INFORMATION SHARED WITH CERTAIN THIRD PARTIES. UNDER THE FDCPA, A COLLECTOR MUST PROVIDE YOU WITH INFORMATION ABOUT THE DEBT IN ITS INITIAL COMMUNICATION OR WITHIN FIVE DAYS AFTER THE INITIAL COMMUNICATION. ALSO, THE FDCPA STATES, \" YOU CAN NOT ATTEMPT TO COLLECT AN DEBT WHILE A PERSON ( THE CONSUMER ) SUPRESS VALIDATION. TRANSUNION, XXXX, XXXX, AND THE ACCOUNTS LISTED BELOW HAVE CLEARLY VIOLATED MY RIGHTS : XXXX ACCOUNT # XXXX, XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX ACCOUNT # XXXX, XXXX XXXX XXXX XXXX ACCOUNT # XXXX, AND XXXX ACCOUNT # XXXX. FAILURE TO RESPOND SATISFACTORILY WITH DELETIONS OF ALL THE ABOVE ACCOUNTS WILL RESULT IN LEGAL ACTIONS BEING TAKEN AGAINST, TRANSUNION, XXXX, XXXX, WHICH I'LL BE SEEKING A {$1000.00} PER VIOLATION FOR DEFAMATION OF CHARACTER ( PER SE ) NEGLIGENT ENABLEMENT OF IDENTITY FRAUD. 15 USC 1681 VIOLATIONS FOR WILLFUL NONCOMPLIANCE-616 CIVIL LIABILITY FOR WILLFUL NONCOPLIANCE. THIS IS THE THIRD TIME I'VE SUBMITTED A COMPLAINT, AND THE REPONSE I GET IS \" YOU CAN NOT LOCATE MY CREDIT REPORT! 
'' THIS IS CLEARLY NEGLIGENCE.\n", - "4. I do not know how this works, but I need it done or somehow corrected. My name is XXXX XXXX, XXXX XXXX XXXX XXXX TN XXXXMy SS XXXX DOB XXXX. I had some issues with my income being affected by the COVID-19PANDEMICSHUTDOWN. I was under the 1 CARESAct, Pub. L. 116-136, section 4021, codified at FCRAsection 623 ( a ) ( 1 ) ( F ) ( i ) ( I ), 15 U.S.C.1681s- 2 ( a ) ( 1 ) ( F ) ( i ) ( I ). I am requesting some accommodations so I care to protect the integrity of my credit file. US DEPT OF ED / XXXX # XXXX, # XXXX accounts are reporting on XXXX, XXXX The was 30,60, 90 DAYS LATEsince requested assistance due to the pandemic. I found a few accounts that I have never done any business with these companies and the accounts do not belong on my report : XXXX XXXX # XXXX, XXXX XXXX XXXX XXXX # XXXX. \n", - "\n", - "I have some issues with the misspelling of my name, my correct spelling is XXXX XXXX. Please remove any other variation of my name they are not correct. The following addresses do not belong to me please delete them : XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXXSC, XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", - "5. I want to know if this is even legal?! How can they disclose information without knowing its a correct email?!\n", - "\n", - "comment list 2:\n", - "1. Hello, my name is XXXX XXXX, and I am writing to delete the following information in my file. The items I need deleted are listed in the report. I am a victim of identity theft and did not make the charge. I ask that the items be deleted to correct my credit report. I reported the theft of my identity to the Federal Trade Commission and I also have enclosed copies of the Federal Trade Commissions Identity Theft Affidavit. Please delete the items as soon as possible. The accounts are being reported currently open and the accounts need to be closed. \n", - "XXXX account number XXXX opened on XX/XX/2022 for the amount {$530.00} XXXX XXXX XXXX account number XXXX opened on XX/XX/2022 for the amount of {$140.00} The accounts are being reported currently open and need to be closed immediately. \n", - "Based on, 15 U.S. Code 1681c2 a consumer reporting agency shall block the reporting of any information in the file of a consumer that the consumer identifies as information that resulted from an alleged identity theft, not later than 4 business days after the date of receipt. This account should not be furnished on my consumer report. As a consumer I am demanding the deletion of the accounts listed IMMEDIATELY.\n", - "2. To whom it may concern : My personal information was breach in the internet as result accounts had been open in my name, I was advise to fill out an Id theft report to help me deal with this situation, I have listed each one of the accounts that do not belong to me. This is my second request to remove unverified items in my report, but XXXX keep rposting these account with out providing any type of original document as the FCRA provide, you need to provide me with original documents or remove these account immediately.\n", - "3. Ive been Disputting my XXXX XXXX I opened this account and someone got my information and used my card, I contacted XXXX over and over, they removed the negative reporting from my XXXX report but still reporting it negative on my XXXX and Expean this is very unfair to me because Im a victim of identity theft\n", - "4. 
Today, XX/XX/2021, I received three items in the mail, one envelope containing an unsolicited debit card from Navy Federal credit Union and the other two, with a letter each describing The Important Rights on two accounts should these accounts become delinquent under New York law. \n", - "\n", - "First of all, I never applied for these accounts with Navy Federal, not have I authorized anyone to do so on my behalf. I immediately contacted Navy Federal via phone and was told I was most likely a victim of identity theft and that I should monitor my credit and use a credit monitoring service. I was also asked for my email and mailing information in order to receive a letter from them regarding this issue. \n", - "\n", - "My main concern is having someone using my identity to illegally open bank accounts and commit fraud, destroying my credit and finances in the process. This bank is in another state from where I reside. I have not lived in Virginia nor do I intend to do so in the foreseeable future.\n", - "5. My personal information ( including my SSN, Drivers License Info, Addresses, and more ) was stolen from a hacking, and Equifax did n't tell the public about the hack until more than a month after the hacking. During this time, three Equifax executives were caught inside trading. It really shows how Equifax cares about other people!\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Build plain-text prompts to send to PaLM 2. Use only 5 complaints from each group.\n", "prompt1 = 'comment list 1:\\n'\n", @@ -1352,100 +516,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": { "id": "BfHGJLirzSvH" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please highlight the most obvious difference betweenthe two lists of comments:\n", - "comment list 1:\n", - "1. XXXX is a peer to peer lending conmpany that uses borrowers crypto to collateralize loans from investors ( like myself ). I've been investing with them for almost XXXX years and currently have {$240000.00} tied up in lending products with XXXX. \n", - "As of XXXX days ago we received an email saying all business operations have been ceased and no withdrawals or deposits will be allowed. They said they'll update customers within 10 days, but no one can reach anyone at the company to find out any more details as they are not answering calls nor returning emails. It also appears the company has scrubbed its XXXX page and the XXXX pages of top executives. \n", - "\n", - "All collateral and client 's investment funds are supposedly held at or processed through XXXX XXXX XXXX ( registered SEC company ). XXXX XXXX keeps telling us to contact XXXX and won't give us any information, so we have no way to find out what's happening with our funds/collateral or if everything is gone. We have a XXXX channel up where people are gathering evidence, documentation, etc. This is probably the best place to start to get a broad view of what's happening. Details below. 
\n", - "\n", - "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX CONST LLC ( Business ID : XXXX ) FoXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 'Cease of Operations ' email received by all investors XXXX XX/XX/2022 at XXXX : \" Dear XXXX Users, Given the collapses of several cryptocurrencies so far this year and the rapidly deteriorating market conditions that have been prompting heavy withdrawals across all XXXX lending and XXXX exchange platforms recently, we are sad to inform you that we are unable to continue to operate our business as usual. As such, we are limiting our business activities, including pausing user withdrawals as allowed under our Terms of XXXX. \n", - "No deposit or investment request will be processed at this time. \n", - "\n", - "Our team is working diligently towards our objective of maximizing value for all of our Users, and our top priority continues to be to protect your interests. As we explore all options available to us, we will provide updates to you as we go. \n", - "\n", - "We hope to communicate with you within the next XXXX business days on the next steps to address the situation. We appreciate your patience in this trying time. \n", - "\n", - "Sincerely yoursXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", - "2. Submitted XX/XX/XXXX\n", - "Typed XX/XX/XXXX:\n", - "\n", - "XX/XX/XXXX\n", - "XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX XXXX\n", - "PH:. XXXX\n", - "PH: XXXX\n", - "EM:\n", - "XXXX\n", - "XXXX\n", - "XXXX XXXX \n", - "XXXX XXXX\n", - "Date of Birth XX/XX/XXXX\n", - "SS#: XXXX\n", - "TO:\n", - "* Consumer Financial Protection Brueau\n", - "* Department of Veteran Affairs, Office of the Inspector General\n", - "My name is XXXX XXXX XXXX, I've received more than one email from Discover Card in my XXXX XXXX, past emails from Discover Card were unautherized deletions.\n", - "From: Discover Card XXXX\n", - "To: You XXXX\n", - "Date: XX/XX/XXXX, XXXX XXXX XXXX From: Discover Card XXXX>\n", - "To Recipient \n", - "Date Mon, XX/XX/XXXX XXXX XXXX\n", - "I dont and havent ever had a Discover Checking, Savings, Business Accounts nor Loans of any kind through any Bank called Discover. The 1st time I was contacted by Discover Card I resided alone from XX/XX/XXXX to XX/XX/XXXXat XXXX XXXX XXXX at XXXX XXXX XXXX XXXX XXXX in XXXX, XXXX years prior to me moving here to XXXX, XXXX in XX/XX/XXXX. When \n", - "\n", - "\n", - "Discover Card had 1st contacted me in XXXX, XXXX it was associated with my XXXX XXXX XXXX website related online Merchants Account. Not once have I ever applied for or had any Website Merchant Accounts here in XXXX; I only applied for online online Merchant Accounts associated with my XXXX related Accounts I purchased while residing in XXXX, XXXX. Some of my website related information was stolen both in XXXX, XXXX and here in XXXX along with my other property that hasn't been returned to me. 
I don't and haven't ever had any XXXX XXXX related Agreements,Contracts or Credit Cards offered to Veterans associated with ones businesses. Nor have I ever applied for or had a Business License or Business Permit in any City or State inspite of my diverse interest. Not once have I ever allowed another be it an Paralegal, Payee, Attorney, Employers, Landlords, Veteran Organizations including Vocational Rehabilitation Programs, XXXX( XXXX XXXX XXXX, XXXX XXXX, Entertainment Companies, Banks, Celebrity Personal Assistant Agencies or Celebs, Shelters, Charities, HUD, Housing Arthority, Department of Veteran Affairs, Military, Law Enforcement or anyone else nor their employess to sign any business related Agreements or Contracts on my behalf; not even my family members or friends. \n", - "None of my XXXX XXXX attempts were associated with my Employers, Department of Veteran Affairs,Vocational Rehabilitation Programs Military, Landlords, HUD( Housing Authority),Friends, Family nor did I ever sign related Agreements or Contracts with them. Not once had I ever provided anyone the passwords to be able to sign into my accounts rather were aware of my accounts or not. Yes, my desktop computer that was stolen along with my other property XX/XX/XXXX was registered with my Online Merchant Account. I had paid for my Merchant related Accounts through my same XXXX XXXX XXXX Account I purchased both of my XXXX XXXX XXXX related accounts through. That was 1st once during the Summer of XX/XX/XXXX and 2nd my related website months later, while I resided in XXXX XXXX and I worked for XXXX. I never offered nor did I ever sign any business Contracts or Agreements with XXXX nor my Landlord or their staff associted with any of my online websites or Merchant Accounts. My XXXX XXXX XXXX Compensation was deposited into both of my XXXX XXXX XXXX Accounts at that time. My account was changed during the Summer of XX/XX/XXXXbecause of theft of my Bank Card. None of my Checking,Savings, past Credit Cards or Business related were shared accounts in which others were allowed to \n", - "use to make purchases. I had written checks from my XXXX XXXX XXXX account to pay for my XXXX XXXX XXXX XXXX on the XXXX XXXX here in XXXX in XX/XX/XXXX before it's name changed to XXXX XXXX. Prior to me using my same account open a Checking account in person at XXXX XXXX before it's name was changed to XXXX XXXX. Where my XXXX XXXX XXXX XXXX has been deposited since that time. I had used my XXXX XXXX Checking to pay for my XXXX XXXX XXXX XXXX both before theft of my property XX/XX/XXXX and that was also prior to the theft of my property from my XXXX XXXX XXXX XXXX in XX/XX/XXXX.\n", - "I've stated this many times:\n", - "I paid for my 1st XXXX XXXX XXXX Membership while employed at XXXX using my XXXX XXXX XXXX account XXXX my XXXX XXXX XXXX XXXX was also deposited. That was changed to XXXX because I didn't receive my 1st XXXX XXXX XXXX Card the bank sent to XXXX XXXX residence on XXXX XXXX in XX/XX/XXXX while I was there. In which both my XXXX salary and XXXX XXXX XXXX XXXX were deposited into my account, no money from XXXX XXXX nor anyone else that was at that residence was given to nor were any of my children there. Nor did XXXX or any other person at that residence ever give me my missing Bank Card not even after I moved out and stayed a month at XXXX XXXX XXXX using my replacement card to pay for my Hotel room. 
Which is the same account I used to pay for XXXX XXXX Membership, XXXX XXXX XXXX, XXXX XXXX Membership fees, and various online Merchant Account activation related fees.\n", - "* XXXX XXXX XXXX.\n", - "XXXX XXXX XXXX XXXX. Membership\n", - "\n", - "# XXXX\n", - "* XXXX XXXX Membership\n", - "# XXXX\n", - "* Total Merchant Services XXXX and XXXX.\n", - "* XXXX XXXX XXXX XXXX XXXX\n", - "* XXXX XXXX changed my $XXXX a month fees to my XXXX XXXX XXXX account #XXXX.\n", - "XX/XX/XXXX - XX/XX/XXXX XXXX XXXX, XXXX.\n", - "\n", - "Rep: XXXX XXXX XXXX, Fl \n", - "XXXX\n", - "XXXX Website \n", - "XXXX\n", - "Software and website owner, I performed Internet advertising and marketing, to promote this software and website. I worked and XXXX from my home XXXX XXXX XXXX XXXX XXXX , XXXX. I purchased XXXX XXXX XXXX-Software Electronic Book CD and was given a website to promote the software on the internet. The XXXX was given a copy of my website owner certificate document submitted to me when I purchased the software marketing program as well copies of my other school transcripts in addition to XXXX XXXX XXXX for example. XXXX, represented the first initials of my children's names. I wasn't ever paid and I'm still owed the money. Nor did my marketing program have anything to do with any schools, college nor university programs nor did I ever offer or sign any agreement to include it such. Nor did my XXXX XXXX XXXX have anything to do with any other employers, Department of Family and Children, Military, Veteran Organizations or Food Stamp programs, Section 8 nor Indianapolis Housing Authority for example; only me.\n", - "Thank you,\n", - "XXXX XXXX\n", - "3. ACCORDING TO 15 U.S. CODE 6803-DISCLOSURE OF INSTITUTION PRIVACY POLICY, AND ACCORDING TO U.S. CODE 6802- OBLIGATIONS WITH RESPECT TO DISCLOSURES OF PERSONAL INFORMATION. ( b ) OPT OUT ( 1 ) IN GENERAL A FINANCIAL INSTITUTION MAY NOT DISCLOSE NONPUBLIC PERSONAL INFORMATION TO A NONAFFILIATED THIRD PARTY ( TRANSUNION, XXXX, AND XXXX. ) UNLESS- ( A ) SUCH FINANCIAL INSTITUTION CLEARLY AND CONSPICUOUSLY DISCLOSES TO THE CONSUMER, IN WRITING OR IN ELECTRONIC FORM OR OTHER FORM PERMITTED BY THE REGULATIONS PRESCRIBED UNDER SECTION 6804 OF THIS TITLE. ALSO ACCORDING TO THE \" XXXX ACT '', FINANCIAL INSTITUTIONS MUST TELL THEIR CUSTOMERS ABOUT THEIR INFORMATION-SHARING PRACTICES AND EXPLAIN TO CUSTOMERS THEIR RIGHT TO \" OPT OUT '' IF THEY DON'T WANT THEIR INFORMATION SHARED WITH CERTAIN THIRD PARTIES. UNDER THE FDCPA, A COLLECTOR MUST PROVIDE YOU WITH INFORMATION ABOUT THE DEBT IN ITS INITIAL COMMUNICATION OR WITHIN FIVE DAYS AFTER THE INITIAL COMMUNICATION. ALSO, THE FDCPA STATES, \" YOU CAN NOT ATTEMPT TO COLLECT AN DEBT WHILE A PERSON ( THE CONSUMER ) SUPRESS VALIDATION. TRANSUNION, XXXX, XXXX, AND THE ACCOUNTS LISTED BELOW HAVE CLEARLY VIOLATED MY RIGHTS : XXXX ACCOUNT # XXXX, XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX XXXX ACCOUNT # XXXXXXXX XXXX XXXX XXXX ACCOUNT # XXXX, XXXX XXXX XXXX XXXX ACCOUNT # XXXX, AND XXXX ACCOUNT # XXXX. FAILURE TO RESPOND SATISFACTORILY WITH DELETIONS OF ALL THE ABOVE ACCOUNTS WILL RESULT IN LEGAL ACTIONS BEING TAKEN AGAINST, TRANSUNION, XXXX, XXXX, WHICH I'LL BE SEEKING A {$1000.00} PER VIOLATION FOR DEFAMATION OF CHARACTER ( PER SE ) NEGLIGENT ENABLEMENT OF IDENTITY FRAUD. 15 USC 1681 VIOLATIONS FOR WILLFUL NONCOMPLIANCE-616 CIVIL LIABILITY FOR WILLFUL NONCOPLIANCE. THIS IS THE THIRD TIME I'VE SUBMITTED A COMPLAINT, AND THE REPONSE I GET IS \" YOU CAN NOT LOCATE MY CREDIT REPORT! 
'' THIS IS CLEARLY NEGLIGENCE.\n", - "4. I do not know how this works, but I need it done or somehow corrected. My name is XXXX XXXX, XXXX XXXX XXXX XXXX TN XXXXMy SS XXXX DOB XXXX. I had some issues with my income being affected by the COVID-19PANDEMICSHUTDOWN. I was under the 1 CARESAct, Pub. L. 116-136, section 4021, codified at FCRAsection 623 ( a ) ( 1 ) ( F ) ( i ) ( I ), 15 U.S.C.1681s- 2 ( a ) ( 1 ) ( F ) ( i ) ( I ). I am requesting some accommodations so I care to protect the integrity of my credit file. US DEPT OF ED / XXXX # XXXX, # XXXX accounts are reporting on XXXX, XXXX The was 30,60, 90 DAYS LATEsince requested assistance due to the pandemic. I found a few accounts that I have never done any business with these companies and the accounts do not belong on my report : XXXX XXXX # XXXX, XXXX XXXX XXXX XXXX # XXXX. \n", - "\n", - "I have some issues with the misspelling of my name, my correct spelling is XXXX XXXX. Please remove any other variation of my name they are not correct. The following addresses do not belong to me please delete them : XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXXSC, XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX\n", - "5. I want to know if this is even legal?! How can they disclose information without knowing its a correct email?!\n", - "comment list 2:\n", - "1. Hello, my name is XXXX XXXX, and I am writing to delete the following information in my file. The items I need deleted are listed in the report. I am a victim of identity theft and did not make the charge. I ask that the items be deleted to correct my credit report. I reported the theft of my identity to the Federal Trade Commission and I also have enclosed copies of the Federal Trade Commissions Identity Theft Affidavit. Please delete the items as soon as possible. The accounts are being reported currently open and the accounts need to be closed. \n", - "XXXX account number XXXX opened on XX/XX/2022 for the amount {$530.00} XXXX XXXX XXXX account number XXXX opened on XX/XX/2022 for the amount of {$140.00} The accounts are being reported currently open and need to be closed immediately. \n", - "Based on, 15 U.S. Code 1681c2 a consumer reporting agency shall block the reporting of any information in the file of a consumer that the consumer identifies as information that resulted from an alleged identity theft, not later than 4 business days after the date of receipt. This account should not be furnished on my consumer report. As a consumer I am demanding the deletion of the accounts listed IMMEDIATELY.\n", - "2. To whom it may concern : My personal information was breach in the internet as result accounts had been open in my name, I was advise to fill out an Id theft report to help me deal with this situation, I have listed each one of the accounts that do not belong to me. This is my second request to remove unverified items in my report, but XXXX keep rposting these account with out providing any type of original document as the FCRA provide, you need to provide me with original documents or remove these account immediately.\n", - "3. Ive been Disputting my XXXX XXXX I opened this account and someone got my information and used my card, I contacted XXXX over and over, they removed the negative reporting from my XXXX report but still reporting it negative on my XXXX and Expean this is very unfair to me because Im a victim of identity theft\n", - "4. 
Today, XX/XX/2021, I received three items in the mail, one envelope containing an unsolicited debit card from Navy Federal credit Union and the other two, with a letter each describing The Important Rights on two accounts should these accounts become delinquent under New York law. \n", - "\n", - "First of all, I never applied for these accounts with Navy Federal, not have I authorized anyone to do so on my behalf. I immediately contacted Navy Federal via phone and was told I was most likely a victim of identity theft and that I should monitor my credit and use a credit monitoring service. I was also asked for my email and mailing information in order to receive a letter from them regarding this issue. \n", - "\n", - "My main concern is having someone using my identity to illegally open bank accounts and commit fraud, destroying my credit and finances in the process. This bank is in another state from where I reside. I have not lived in Virginia nor do I intend to do so in the foreseeable future.\n", - "5. My personal information ( including my SSN, Drivers License Info, Addresses, and more ) was stolen from a hacking, and Equifax did n't tell the public about the hack until more than a month after the hacking. During this time, three Equifax executives were caught inside trading. It really shows how Equifax cares about other people!\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# The plain English request we will make of PaLM 2\n", "prompt = (\n", @@ -1465,42 +540,20 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": { "id": "mL5P0_3X04dE" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 66e3af22-91cb-400a-92c3-69e7cd12ee01 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from bigframes.ml.llm import PaLM2TextGenerator\n", "\n", -<<<<<<< HEAD "q_a_model = PaLM2TextGenerator()" -======= - "# Create a BigQuery Cloud resource connection\n", - "CONN_NAME = \"bqdf-llm\"\n", - "session = bf.get_global_session()\n", - "\n", - "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n", - "q_a_model = PaLM2TextGenerator(session=session, connection_name=connection)" ->>>>>>> origin/lmm-kmeans-notebook ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": { "id": "ICWHsqAW1FNk" }, @@ -1512,58 +565,11 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": { "id": "gB7e1LXU1pst" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 653add17-29be-408c-8882-064217f8556e is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8fd16954-853a-45fd-80bc-65b1242429e2 is DONE. 8 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d9929bcb-26ce-4844-b68e-f4a980b90ede is DONE. 171 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "' The first comment list is about people complaining about companies or services, while the second comment list is about people reporting identity theft or fraud.'" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Send the request for PaLM 2 to generate a response to our prompt\n", "major_difference = q_a_model.predict(df)\n", @@ -1585,11 +591,7 @@ "source": [ "# Summary and next steps\n", "\n", -<<<<<<< HEAD "You've used the ML and LLM capabilities of BigQuery DataFrames to help analyze and understand a large dataset of unstructured feedback.\n", -======= - "You've used BigQuery DataFrames' integration with LLM models (`bigframes.ml.llm`) to generate code samples, and have tranformed LLM output by creating and using a custom function in BigQuery DataFrames.\n", ->>>>>>> origin/lmm-kmeans-notebook "\n", "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." ] From 81125f9505ad98e89939769a8e1fcf30518705f0 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:58:14 -0800 Subject: [PATCH 04/26] feat: send warnings on LLM prediction partial failures (#216) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/llm.py | 78 ++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 93e2ba825f..78f3369daf 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -17,6 +17,7 @@ from __future__ import annotations from typing import cast, Literal, Optional, Union +import warnings import bigframes from bigframes import clients, constants @@ -24,15 +25,22 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT = "text-bison" -_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT = "text-bison-32k" -_TEXT_GENERATE_RESULT_COLUMN = "ml_generate_text_llm_result" +_TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" +_TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" +_TEXT_GENERATOR_ENDPOINTS = ( + _TEXT_GENERATOR_BISON_ENDPOINT, + _TEXT_GENERATOR_BISON_32K_ENDPOINT, +) -_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT = "textembedding-gecko" -_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT = ( - "textembedding-gecko-multilingual" +_EMBEDDING_GENERATOR_GECKO_ENDPOINT = "textembedding-gecko" +_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT = "textembedding-gecko-multilingual" +_EMBEDDING_GENERATOR_ENDPOINTS = ( + _EMBEDDING_GENERATOR_GECKO_ENDPOINT, + _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, ) -_EMBED_TEXT_RESULT_COLUMN = "text_embedding" + +_ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" +_ML_EMBED_TEXT_STATUS = "ml_embed_text_status" class PaLM2TextGenerator(base.Predictor): @@ -90,18 +98,16 @@ def _create_bqml_model(self): connection_id=connection_name_parts[2], iam_role="aiplatform.user", ) - if self.model_name == _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT, - } - elif self.model_name == _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT, - } - else: + + if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: raise ValueError( - f"Model name {self.model_name} is not supported. We only support {_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT}." + f"Model name {self.model_name} is not supported. We only support {', '.join(_TEXT_GENERATOR_ENDPOINTS)}." ) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -182,7 +188,16 @@ def predict( "top_p": top_p, "flatten_json_output": True, } - return self._bqml_model.generate_text(X, options) + + df = self._bqml_model.generate_text(X, options) + + if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): + warnings.warn( + f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for detailed status. 
You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df class PaLM2TextEmbeddingGenerator(base.Predictor): @@ -241,19 +256,15 @@ def _create_bqml_model(self): connection_id=connection_name_parts[2], iam_role="aiplatform.user", ) - if self.model_name == "textembedding-gecko": - options = { - "endpoint": _REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT, - } - elif self.model_name == _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT, - } - else: + + if self.model_name not in _EMBEDDING_GENERATOR_ENDPOINTS: raise ValueError( - f"Model name {self.model_name} is not supported. We only support {_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT}." + f"Model name {self.model_name} is not supported. We only support {', '.join(_EMBEDDING_GENERATOR_ENDPOINTS)}." ) + options = { + "endpoint": self.model_name, + } return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -284,4 +295,13 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: options = { "flatten_json_output": True, } - return self._bqml_model.generate_text_embedding(X, options) + + df = self._bqml_model.generate_text_embedding(X, options) + + if (df[_ML_EMBED_TEXT_STATUS] != "").any(): + warnings.warn( + f"Some predictions failed. Check column {_ML_EMBED_TEXT_STATUS} for detailed status. You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df From 52dfad281def82548751a276ce42b087dbb09f9a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 17 Nov 2023 21:42:14 +0000 Subject: [PATCH 05/26] docs: code samples for `Series.where` and `Series.mask` (#217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated - `Series.where`: https://screenshot.googleplex.com/9XWHpMnwrzVPF9G - `Series.mask`: https://screenshot.googleplex.com/4cPvvzoVaVzoCDD Fixes internal issue 310981880 🦕 --- .../bigframes_vendored/pandas/core/series.py | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6d98075f5..01175dc0ef 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1696,6 +1696,49 @@ def kurt(self): def where(self, cond, other): """Replace values where the condition is False. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([10, 11, 12, 13, 14]) + >>> s + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: Int64 + + You can filter the values in the Series based on a condition. The values + matching the condition would be kept, and not matching would be replaced. + The default replacement value is ``NA``. 
+ + >>> s.where(s % 2 == 0) + 0 10 + 1 + 2 12 + 3 + 4 14 + dtype: Int64 + + You can specify a custom replacement value for non-matching values. + + >>> s.where(s % 2 == 0, -1) + 0 10 + 1 -1 + 2 12 + 3 -1 + 4 14 + dtype: Int64 + >>> s.where(s % 2 == 0, 100*s) + 0 10 + 1 1100 + 2 12 + 3 1300 + 4 14 + dtype: Int64 + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is True, keep the original value. Where False, replace @@ -1720,6 +1763,77 @@ def where(self, cond, other): def mask(self, cond, other): """Replace values where the condition is True. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([10, 11, 12, 13, 14]) + >>> s + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: Int64 + + You can mask the values in the Series based on a condition. The values + matching the condition would be masked. + + >>> s.mask(s % 2 == 0) + 0 + 1 11 + 2 + 3 13 + 4 + dtype: Int64 + + You can specify a custom mask value. + + >>> s.mask(s % 2 == 0, -1) + 0 -1 + 1 11 + 2 -1 + 3 13 + 4 -1 + dtype: Int64 + >>> s.mask(s % 2 == 0, 100*s) + 0 1000 + 1 11 + 2 1200 + 3 13 + 4 1400 + dtype: Int64 + + You can also use a remote function to evaluate the mask condition. This + is useful in situation such as the following, where the mask + condition is evaluated based on a complicated business logic which cannot + be expressed in form of a Series. + + >>> @bpd.remote_function([str], bool, reuse=False) + ... def should_mask(name): + ... hash = 0 + ... for char_ in name: + ... hash += ord(char_) + ... return hash % 2 == 0 + + >>> s = bpd.Series(["Alice", "Bob", "Caroline"]) + >>> s + 0 Alice + 1 Bob + 2 Caroline + dtype: string + >>> s.mask(should_mask) + 0 + 1 Bob + 2 Caroline + dtype: string + >>> s.mask(should_mask, "REDACTED") + 0 REDACTED + 1 Bob + 2 Caroline + dtype: string + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is False, keep the original value. Where True, replace From a18d40e808ee0822d21715cc3e8f794c418aeebc Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 17 Nov 2023 14:42:15 -0800 Subject: [PATCH 06/26] fix: avoid unnecessary row_number() on sort key for io (#211) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/__init__.py | 12 ++++++++---- bigframes/core/compile/compiled.py | 29 ++++++++++++++++++++--------- bigframes/dataframe.py | 12 ++++-------- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index b476961bdc..e19fec8f3f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -125,14 +125,18 @@ def to_sql( col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - if sorted or offset_column: - return self._compile_ordered().to_sql( - offset_column=offset_column, + array_value = self + if offset_column: + array_value = self.promote_offsets(offset_column) + if sorted: + return array_value._compile_ordered().to_sql( col_id_overrides=col_id_overrides, sorted=sorted, ) else: - return self._compile_unordered().to_sql(col_id_overrides=col_id_overrides) + return array_value._compile_unordered().to_sql( + col_id_overrides=col_id_overrides + ) def start_query( self, diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 78050ed4f0..461c2c005a 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -1031,31 +1031,42 @@ def _reproject_to_table(self) -> OrderedIR: def to_sql( self, - offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - offsets_id = offset_column or ORDER_ID_COLUMN - sql = ibis_bigquery.Backend().compile( self._to_ibis_expr( - ordering_mode="offset_col" - if (offset_column or sorted) - else "unordered", - order_col_name=offsets_id, + ordering_mode="unordered", col_id_overrides=col_id_overrides, + expose_hidden_cols=sorted, ) ) if sorted: + output_columns = [ + col_id_overrides.get(col) if (col in col_id_overrides) else col + for col in self.column_ids + ] + selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns)) + order_by_clause = self._ordering_clause(self._ordering.all_ordering_columns) + sql = textwrap.dedent( - f"SELECT * EXCEPT (`{offsets_id}`)\n" + f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n" - f"ORDER BY `{offsets_id}`\n" + f"{order_by_clause}\n" ) return typing.cast(str, sql) + def _ordering_clause(self, ordering: Iterable[OrderingColumnReference]) -> str: + parts = [] + for col_ref in ordering: + asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" + null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" + part = f"`{col_ref.column_id}` {asc_desc} {null_clause}" + parts.append(part) + return f"ORDER BY {' ,'.join(parts)}" + def _to_ibis_expr( self, *, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 57b4ca42cf..1f1275e217 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2577,14 +2577,10 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: } if ordering_id is not None: - return array_value.to_sql( - offset_column=ordering_id, - col_id_overrides=id_overrides, - ) - else: - return array_value.to_sql( - col_id_overrides=id_overrides, - ) + array_value = array_value.promote_offsets(ordering_id) + return array_value.to_sql( + col_id_overrides=id_overrides, + ) def _run_io_query( self, From 010486c3494e05d714da6cc7d51514518d9ae1ea Mon Sep 17 00:00:00 2001 
From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:38:14 -0800 Subject: [PATCH 07/26] docs: add code samples for df reshaping, function, merge, and join methods (#203) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #310245117 --
[310245117](https://b.corp.google.com/issues/310245117) 🦕
---
 .../bigframes_vendored/pandas/core/frame.py   | 218 +++++++++++++++++-
 1 file changed, 217 insertions(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index b35d0f3b2e..8033c064d7 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2121,6 +2121,59 @@ def groupby(
         used to group large amounts of data and compute operations on these
         groups.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon',
+            ...                                'Parrot', 'Parrot'],
+            ...                     'Max Speed': [380., 370., 24., 26.]})
+            >>> df
+               Animal  Max Speed
+            0  Falcon      380.0
+            1  Falcon      370.0
+            2  Parrot       24.0
+            3  Parrot       26.0
+
+            [4 rows x 2 columns]
+
+            >>> df.groupby(['Animal'])['Max Speed'].mean()
+            Animal
+            Falcon    375.0
+            Parrot     25.0
+            Name: Max Speed, dtype: Float64
+
+        We can also choose to include NA in group keys or not by setting `dropna`:
+
+            >>> df = bpd.DataFrame([[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]],
+            ...                    columns=["a", "b", "c"])
+            >>> df.groupby(by=["b"]).sum()
+                 a  c
+            b
+            1.0  2  3
+            2.0  2  5
+
+            [2 rows x 2 columns]
+
+            >>> df.groupby(by=["b"], dropna=False).sum()
+                  a  c
+            b
+            1.0   2  3
+            2.0   2  5
+            <NA>  1  4
+
+            [3 rows x 2 columns]
+
+        We can also choose whether to return an object with group labels by
+        setting `as_index`:
+
+            >>> df.groupby(by=["b"], as_index=False).sum()
+                 b  a  c
+            0  1.0  2  3
+            1  2.0  2  5
+
+            [2 rows x 3 columns]
+
         Args:
             by (str, Sequence[str]):
                 A label or list of labels may be passed to group by the columns
@@ -2224,7 +2277,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
                 Python function wrapped by ``remote_function`` decorator,
                 returns a single value from a single value.
             na_action (Optional[str], default None):
-                ``{None, 'ignore'}``, default None. If ‘ignore’, propagate NaN
+                ``{None, 'ignore'}``, default None. If `ignore`, propagate NaN
                 values, without passing them to func.

         Returns:
@@ -2240,6 +2293,74 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:

         Join columns with `other` DataFrame on index

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        Join two DataFrames by specifying how to handle the operation:
+
+            >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11])
+            >>> df1
+               col1  col2
+            10  foo     1
+            11  bar     2
+
+            [2 rows x 2 columns]
+
+            >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}, index=[11, 22])
+            >>> df2
+               col3  col4
+            11  foo     3
+            22  baz     4
+
+            [2 rows x 2 columns]
+
+            >>> df1.join(df2)
+               col1  col2  col3  col4
+            10  foo     1  <NA>  <NA>
+            11  bar     2   foo     3
+
+            [2 rows x 4 columns]
+
+            >>> df1.join(df2, how="left")
+               col1  col2  col3  col4
+            10  foo     1  <NA>  <NA>
+            11  bar     2   foo     3
+
+            [2 rows x 4 columns]
+
+            >>> df1.join(df2, how="right")
+               col1  col2  col3  col4
+            11   bar     2   foo     3
+            22  <NA>  <NA>   baz     4
+
+            [2 rows x 4 columns]
+
+            >>> df1.join(df2, how="outer")
+               col1  col2  col3  col4
+            10   foo     1  <NA>  <NA>
+            11   bar     2   foo     3
+            22  <NA>  <NA>   baz     4
+
+            [3 rows x 4 columns]
+
+            >>> df1.join(df2, how="inner")
+               col1  col2  col3  col4
+            11  bar     2   foo     3
+
+            [1 rows x 4 columns]
+
+        Another option to join using the key columns is to use the `on` parameter:
+
+            >>> df1.join(df2, on="col1", how="right")
+                  col1  col2 col3  col4
+            <NA>    11  <NA>  foo     3
+            <NA>    22  <NA>  baz     4
+
+            [2 rows x 4 columns]
+
         Args:
             other:
                 DataFrame with an Index similar to the Index of this one.
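A quick editorial aside between hunks, since the `on` parameter shown above is the least obvious variant: `join(..., on=...)` matches a *column* of the calling DataFrame against the *index* of the other DataFrame, rather than joining index to index. The sketch below uses plain pandas as a local stand-in (an assumption for illustration only; the doctests above run under `bigframes.pandas`, which mirrors these pandas semantics, though its output formatting differs):

```python
import pandas as pd

# `on="col1"` matches df1's "col1" values against df2's index,
# not df1's index against df2's index.
df1 = pd.DataFrame({"col1": ["foo", "bar"], "col2": [1, 2]}, index=[10, 11])
df2 = pd.DataFrame({"col3": ["x", "y"], "col4": [3, 4]}, index=["foo", "baz"])

print(df1.join(df2, on="col1"))
#    col1  col2 col3  col4
# 10  foo     1    x   3.0   <- "foo" found in df2's index
# 11  bar     2  NaN   NaN   <- "bar" not in df2's index
```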
@@ -2292,6 +2413,78 @@ def merge(
         rows will be matched against each other. This is different from usual SQL
         join behaviour and can lead to unexpected results.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        Merge DataFrames df1 and df2 by specifying the type of merge:
+
+            >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
+            >>> df1
+                 a  b
+            0  foo  1
+            1  bar  2
+
+            [2 rows x 2 columns]
+
+            >>> df2 = bpd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
+            >>> df2
+                 a  c
+            0  foo  3
+            1  baz  4
+
+            [2 rows x 2 columns]
+
+            >>> df1.merge(df2, how="inner", on="a")
+                 a  b  c
+            0  foo  1  3
+
+            [1 rows x 3 columns]
+
+            >>> df1.merge(df2, how='left', on='a')
+                 a  b     c
+            0  foo  1     3
+            1  bar  2  <NA>
+
+            [2 rows x 3 columns]
+
+        Merge df1 and df2 on the lkey and rkey columns. The value columns have
+        the default suffixes, _x and _y, appended.
+
+            >>> df1 = bpd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+            ...                      'value': [1, 2, 3, 5]})
+            >>> df1
+              lkey  value
+            0  foo      1
+            1  bar      2
+            2  baz      3
+            3  foo      5
+
+            [4 rows x 2 columns]
+
+            >>> df2 = bpd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+            ...                      'value': [5, 6, 7, 8]})
+            >>> df2
+              rkey  value
+            0  foo      5
+            1  bar      6
+            2  baz      7
+            3  foo      8
+
+            [4 rows x 2 columns]
+
+            >>> df1.merge(df2, left_on='lkey', right_on='rkey')
+              lkey  value_x rkey  value_y
+            0  foo        1  foo        5
+            1  foo        1  foo        8
+            2  bar        2  bar        6
+            3  baz        3  baz        7
+            4  foo        5  foo        5
+            5  foo        5  foo        8
+
+            [6 rows x 4 columns]
+
         Args:
             right:
                 Object to merge with.
@@ -2342,6 +2535,29 @@ def apply(self, func, *, args=(), **kwargs):
         the DataFrame's index (``axis=0``) the final return type
         is inferred from the return type of the applied
         function.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> df
+               col1  col2
+            0     1     3
+            1     2     4
+
+            [2 rows x 2 columns]
+
+            >>> def square(x):
+            ...     return x * x
+            >>> df1 = df.apply(square)
+            >>> df1
+               col1  col2
+            0     1     9
+            1     4    16
+
+            [2 rows x 2 columns]
+
         Args:
             func (function):
                 Function to apply to each column or row.
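Before moving on to the next patch, a brief cross-check of the corrected `apply` doctest above. This sketch uses plain pandas rather than BigQuery DataFrames (an assumption for local illustration; the bigframes behavior documented above mirrors it for this elementwise case):

```python
import pandas as pd

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

def square(x):
    # With the default axis=0, `x` is an entire column (a Series),
    # so this squares every element column by column.
    return x * x

print(df.apply(square))
#    col1  col2
# 0     1     9
# 1     4    16
```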
From dd78acb174545ba292776a642afcec46f8ee4a2a Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Mon, 20 Nov 2023 10:36:14 -0800
Subject: [PATCH 08/26] fix: dedup special character (#209)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: dedup special character

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot
---
 tests/unit/core/test_bf_utils.py                   |  6 +++---
 third_party/bigframes_vendored/pandas/io/common.py | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py
index fc34f35d9c..10ce1fd09e 100644
--- a/tests/unit/core/test_bf_utils.py
+++ b/tests/unit/core/test_bf_utils.py
@@ -25,7 +25,7 @@ def test_get_standardized_ids_columns():
         "0",
         utils.UNNAMED_COLUMN_ID,
         "duplicate",
-        "duplicate.1",
+        "duplicate_1",
         "with_space",
     ]

     assert idx_ids == []
@@ -37,13 +37,13 @@ def test_get_standardized_ids_indexes():

     col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels)

-    assert col_ids == ["duplicate.2"]
+    assert col_ids == ["duplicate_2"]
     assert idx_ids == [
         "string",
         "0",
         utils.UNNAMED_INDEX_ID,
         "duplicate",
-        "duplicate.1",
+        "duplicate_1",
         "with_space",
     ]

diff --git a/third_party/bigframes_vendored/pandas/io/common.py b/third_party/bigframes_vendored/pandas/io/common.py
index 506984e64d..e186f02b5b 100644
--- a/third_party/bigframes_vendored/pandas/io/common.py
+++ b/third_party/bigframes_vendored/pandas/io/common.py
@@ -13,13 +13,13 @@ def dedup_names(
     """
     Rename column names if duplicates exist.

-    Currently the renaming is done by appending a period and an autonumeric,
-    but a custom pattern may be supported in the future.
+    Currently the renaming is done by appending an underscore and an
+    autonumeric, but a custom pattern may be supported in the future.

    Examples
    ```
    dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
-   ['x', 'y', 'x.1', 'x.2']
+   ['x', 'y', 'x_1', 'x_2']
    ```
    """
    names = list(names)  # so we can index
@@ -34,9 +34,9 @@ def dedup_names(
         if is_potential_multiindex:
             # for mypy
             assert isinstance(col, tuple)
-            col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+            col = col[:-1] + (f"{col[-1]}_{cur_count}",)
         else:
-            col = f"{col}.{cur_count}"
+            col = f"{col}_{cur_count}"
         cur_count = counts[col]

         names[i] = col

From c88d38e69682f4c620174086b8f16f4780c04811 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Mon, 20 Nov 2023 23:02:15 +0000
Subject: [PATCH 09/26] docs: add code samples for `index` and `column`
 properties (#212)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a
few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a
  [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose)
  before writing your code!
That way we can discuss the change, evaluate designs, and agree on the general
idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated
  - `DataFrame.columns`: https://screenshot.googleplex.com/3Bwdb482FBfEsi2
  - `DataFrame.index`: https://screenshot.googleplex.com/4iJymH3FxMn8Hhb
  - `Series.index`: https://screenshot.googleplex.com/7MXQcuASbQ3c8s5

Fixes internal issue 310260952 🦕
---
 .../bigframes_vendored/pandas/core/frame.py   | 79 ++++++++++++++++++-
 .../bigframes_vendored/pandas/core/series.py  | 49 +++++++++++-
 2 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 8033c064d7..f448ad7939 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -3106,6 +3106,47 @@ def index(self):
         index is used for label-based access and alignment, and can be accessed
         or modified using this attribute.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        You can access the index of a DataFrame via ``index`` property.
+
+            >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
+            ...                     'Age': [25, 30, 35],
+            ...                     'Location': ['Seattle', 'New York', 'Kona']},
+            ...                    index=([10, 20, 30]))
+            >>> df
+                  Name  Age  Location
+            10   Alice   25   Seattle
+            20     Bob   30  New York
+            30  Aritra   35      Kona
+
+            [3 rows x 3 columns]
+            >>> df.index # doctest: +ELLIPSIS
+            <bigframes.core.indexes.index.Index object at 0x...>
+            >>> df.index.values
+            array([10, 20, 30], dtype=object)
+
+        Let's try setting a new index for the dataframe and see it reflected via
+        the ``index`` property.
+
+            >>> df1 = df.set_index(["Name", "Location"])
+            >>> df1
+                             Age
+            Name   Location
+            Alice  Seattle    25
+            Bob    New York   30
+            Aritra Kona       35
+
+            [3 rows x 1 columns]
+            >>> df1.index # doctest: +ELLIPSIS
+            <bigframes.core.indexes.index.Index object at 0x...>
+            >>> df1.index.values
+            array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')],
+                  dtype=object)
+
         Returns:
             The index labels of the DataFrame.
         """
@@ -3113,7 +3154,43 @@ def index(self):

     @property
     def columns(self):
-        "The column labels of the DataFrame."
+        """The column labels of the DataFrame.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        You can access the column labels of a DataFrame via ``columns`` property.
+
+            >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
+            ...                     'Age': [25, 30, 35],
+            ...                     'Location': ['Seattle', 'New York', 'Kona']},
+            ...                    index=([10, 20, 30]))
+            >>> df
+                  Name  Age  Location
+            10   Alice   25   Seattle
+            20     Bob   30  New York
+            30  Aritra   35      Kona
+
+            [3 rows x 3 columns]
+            >>> df.columns
+            Index(['Name', 'Age', 'Location'], dtype='object')
+
+        You can also set new labels for columns.
+
+            >>> df.columns = ["NewName", "NewAge", "NewLocation"]
+            >>> df
+               NewName  NewAge NewLocation
+            10   Alice      25     Seattle
+            20     Bob      30    New York
+            30  Aritra      35        Kona
+
+            [3 rows x 3 columns]
+            >>> df.columns
+            Index(['NewName', 'NewAge', 'NewLocation'], dtype='object')
+
+        """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

     def value_counts(
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 01175dc0ef..a86765a412 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -44,7 +44,54 @@ def struct(self):

     @property
     def index(self):
-        """The index (axis labels) of the Series."""
+        """The index (axis labels) of the Series.
+
+        The index of a Series is used to label and identify each element of the
+        underlying data. The index can be thought of as an immutable ordered set
+        (technically a multi-set, as it may contain duplicate labels), and is
+        used to index and align data.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        You can access the index of a Series via ``index`` property.
+
+            >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
+            ...                     'Age': [25, 30, 35],
+            ...                     'Location': ['Seattle', 'New York', 'Kona']},
+            ...                    index=([10, 20, 30]))
+            >>> s = df["Age"]
+            >>> s
+            10    25
+            20    30
+            30    35
+            Name: Age, dtype: Int64
+            >>> s.index # doctest: +ELLIPSIS
+            <bigframes.core.indexes.index.Index object at 0x...>
+            >>> s.index.values
+            array([10, 20, 30], dtype=object)
+
+        Let's try a multi-index case and see it reflected via the ``index``
+        property.
+
+            >>> df1 = df.set_index(["Name", "Location"])
+            >>> s1 = df1["Age"]
+            >>> s1
+            Name    Location
+            Alice   Seattle     25
+            Bob     New York    30
+            Aritra  Kona        35
+            Name: Age, dtype: Int64
+            >>> s1.index # doctest: +ELLIPSIS
+            <bigframes.core.indexes.index.Index object at 0x...>
+            >>> s1.index.values
+            array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')],
+                  dtype=object)
+
+        Returns:
+            The index labels of the Series.
+        """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

     @property

From 1d1477158b5a8e84d099e07c6f566182a1abd7fc Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 21 Nov 2023 16:16:25 +0000
Subject: [PATCH 10/26] test: re-enable `system_prerelease` tests (#221)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test: re-enable `system_prerelease` tests

* exclude ibis from prerelease install list

* install explicit 6.2.0 version for ibis in prerelease

* add unit_prerelease to pre and post submit e2e tests

* Update noxfile.py

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Tim Swast
Co-authored-by: Owl Bot
---
 .kokoro/continuous/e2e.cfg |  2 +-
 .kokoro/presubmit/e2e.cfg  |  2 +-
 noxfile.py                 | 50 ++++++++++++++++++++++++++++++++------
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg
index d875f36060..2f93a58212 100644
--- a/.kokoro/continuous/e2e.cfg
+++ b/.kokoro/continuous/e2e.cfg
@@ -3,5 +3,5 @@
 # Only run this nox session.
 env_vars: {
   key: "NOX_SESSION"
-  value: "system_noextras e2e notebook samples"
+  value: "unit_prerelease system_prerelease system_noextras e2e notebook samples"
 }
diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg
index d875f36060..2f93a58212 100644
--- a/.kokoro/presubmit/e2e.cfg
+++ b/.kokoro/presubmit/e2e.cfg
@@ -3,5 +3,5 @@
 # Only run this nox session.
 env_vars: {
   key: "NOX_SESSION"
-  value: "system_noextras e2e notebook samples"
+  value: "unit_prerelease system_prerelease system_noextras e2e notebook samples"
 }
diff --git a/noxfile.py b/noxfile.py
index da61232fc7..8d6d641fc1 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -494,6 +494,11 @@ def prerelease(session: nox.sessions.Session, tests_path):
         CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt"
     )

+    # Ignore officially released versions of certain packages specified in
+    # testing/constraints-*.txt and install more recent, pre-release versions
+    # directly
+    already_installed = set()
+
     # PyArrow prerelease packages are published to an alternative PyPI host.
     # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages
     session.install(
@@ -504,6 +509,8 @@ def prerelease(session: nox.sessions.Session, tests_path):
         "--upgrade",
         "pyarrow",
     )
+    already_installed.add("pyarrow")
+
     session.install(
         "--extra-index-url",
         "https://pypi.anaconda.org/scipy-wheels-nightly/simple",
@@ -512,16 +519,48 @@ def prerelease(session: nox.sessions.Session, tests_path):
         "--upgrade",
         "pandas",
     )
+    already_installed.add("pandas")
+
+    # TODO(shobs):
+    # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2
+    # introduced a breaking change that removed the following:
+    #     ibis.expr.rules.column
+    #     ibis.expr.rules.value
+    #     ibis.expr.rules.any
+    # Let's exclude ibis head from the prerelease install list for now. Instead, use
+    # a working ibis-framework version resolved via setup.py (currently resolves
+    # to version 6.2.0 due to version requirement ">=6.2.0,<7.0.0dev").
+    # We should enable the head back once bigframes supports a version that
+    # includes the above commit.
+    # session.install(
+    #     "--upgrade",
+    #     "-e",  # Use -e so that py.typed file is included.
+    #     "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework",
+    # )
+    session.install("--no-deps", "ibis-framework==6.2.0")
+    already_installed.add("ibis-framework")
+
+    # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178
+    session.install("--no-deps", "db-dtypes")
+    already_installed.add("db-dtypes")
+
+    # Ensure we catch breaking changes in the client libraries early.
+    session.install(
+        "--upgrade",
+        "-e",
+        "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery",
+    )
+    already_installed.add("google-cloud-bigquery")
     session.install(
         "--upgrade",
-        "-e",  # Use -e so that py.typed file is included.
-        "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework",
+        "-e",
+        "git+https://github.com/googleapis/python-bigquery-storage.git#egg=google-cloud-bigquery-storage",
     )
-    # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178
-    session.install("--no-deps", "db-dtypes")
+    already_installed.add("google-cloud-bigquery-storage")

     # Workaround to install pandas-gbq >=0.15.0, which is required by test only.
session.install("--no-deps", "pandas-gbq") + already_installed.add("pandas-gbq") session.install( *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES), @@ -541,9 +580,6 @@ def prerelease(session: nox.sessions.Session, tests_path): constraints_text = constraints_file.read() # Ignore leading whitespace and comment lines. - already_installed = frozenset( - ("db-dtypes", "pandas", "pyarrow", "ibis-framework", "pandas-gbq") - ) deps = [ match.group(1) for match in re.finditer( From ed8876d3439a3b45b65e8789737c3c2e3a7f1adb Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:02:31 -0800 Subject: [PATCH 11/26] feat: add the recent api method for ML component (#225) * feat: add the recent api method for ML component --- bigframes/ml/cluster.py | 2 ++ bigframes/ml/compose.py | 2 ++ bigframes/ml/decomposition.py | 2 ++ bigframes/ml/ensemble.py | 5 +++++ bigframes/ml/forecasting.py | 2 ++ bigframes/ml/imported.py | 3 +++ bigframes/ml/linear_model.py | 3 +++ bigframes/ml/llm.py | 4 +++- bigframes/ml/pipeline.py | 2 ++ bigframes/ml/preprocessing.py | 7 +++++++ tests/unit/session/test_io_bigquery.py | 3 +++ 11 files changed, 34 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index c9f52ba0b6..6b79d356a2 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -22,11 +22,13 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.cluster._kmeans +@log_adapter.class_logger class KMeans( base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans, diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index bf046ff691..ace876dd2d 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -22,6 +22,7 @@ from typing import List, Optional, Tuple, Union from bigframes import constants +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.compose._column_transformer @@ -36,6 +37,7 @@ ] +@log_adapter.class_logger class ColumnTransformer( base.Transformer, third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 7cda7a6993..ef777cb33a 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -22,11 +22,13 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.decomposition._pca +@log_adapter.class_logger class PCA( base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.decomposition._pca.PCA, diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index fcb3fe5343..1cc9fb3739 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -22,6 +22,7 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.ensemble._forest @@ -47,6 +48,7 @@ } +@log_adapter.class_logger class XGBRegressor( base.SupervisedTrainablePredictor, 
third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor, @@ -202,6 +204,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class XGBClassifier( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier, @@ -356,6 +359,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class RandomForestRegressor( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, @@ -521,6 +525,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class RandomForestClassifier( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index cf23854fa0..995201062b 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -21,10 +21,12 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +@log_adapter.class_logger class ARIMAPlus(base.SupervisedTrainablePredictor): """Time Series ARIMA Plus model.""" diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index f6afc9aa38..4ae0a8ea4d 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -21,10 +21,12 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +@log_adapter.class_logger class TensorFlowModel(base.Predictor): """Imported TensorFlow model. @@ -101,6 +103,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class ONNXModel(base.Predictor): """Imported Open Neural Network Exchange (ONNX) model. 
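An aside between file diffs: Patch 11 threads `@log_adapter.class_logger` through every ML class, but the decorator's implementation is not included in this series. For orientation, here is a minimal sketch of the general pattern it suggests — the names and details below are assumptions, not the actual `bigframes.core.log_adapter` code (which, judging by the tests later in this patch, also exposes a `get_and_reset_api_methods()` used when building BigQuery job labels):

```python
import functools

# Hypothetical module-level record of called API methods.
_api_methods: list = []

def class_logger(cls):
    # Wrap every public method of the class so each call records its name.
    # list(...) materializes the items, since we mutate the class while iterating.
    for name, attr in list(vars(cls).items()):
        if callable(attr) and not name.startswith("_"):
            setattr(cls, name, _logged(attr))
    return cls

def _logged(method):
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        _api_methods.append(method.__name__)  # record usage, then delegate
        return method(*args, **kwargs)
    return wrapper

# Usage sketch:
@class_logger
class KMeans:
    def fit(self, X):
        return self

KMeans().fit(None)
print(_api_methods)  # ['fit']
```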
diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 433d9fbc38..5ee87b8850 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -23,6 +23,7 @@ import bigframes import bigframes.constants as constants +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.linear_model._base @@ -46,6 +47,7 @@ } +@log_adapter.class_logger class LinearRegression( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression, @@ -178,6 +180,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class LogisticRegression( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 78f3369daf..5beb54a32d 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -21,7 +21,7 @@ import bigframes from bigframes import clients, constants -from bigframes.core import blocks +from bigframes.core import blocks, log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -43,6 +43,7 @@ _ML_EMBED_TEXT_STATUS = "ml_embed_text_status" +@log_adapter.class_logger class PaLM2TextGenerator(base.Predictor): """PaLM2 text generator LLM model. @@ -200,6 +201,7 @@ def predict( return df +@log_adapter.class_logger class PaLM2TextEmbeddingGenerator(base.Predictor): """PaLM2 text embedding generator LLM model. diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index ad0b3fae11..4ae2bfe555 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -24,11 +24,13 @@ import bigframes import bigframes.constants as constants +from bigframes.core import log_adapter from bigframes.ml import base, compose, forecasting, loader, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.pipeline +@log_adapter.class_logger class Pipeline( base.BaseEstimator, third_party.bigframes_vendored.sklearn.pipeline.Pipeline, diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 5f44d40218..a403e57e71 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -20,6 +20,7 @@ import typing from typing import Any, cast, List, Literal, Optional, Tuple, Union +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data @@ -28,6 +29,7 @@ import third_party.bigframes_vendored.sklearn.preprocessing._label +@log_adapter.class_logger class StandardScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, @@ -111,6 +113,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class MaxAbsScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler, @@ -194,6 +197,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class MinMaxScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler, @@ -277,6 +281,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class KBinsDiscretizer( 
 base.Transformer,
     third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer,
@@ -395,6 +400,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
     )


+@log_adapter.class_logger
 class OneHotEncoder(
     base.Transformer,
     third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder,
@@ -524,6 +530,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
     )


+@log_adapter.class_logger
 class LabelEncoder(
     base.LabelTransformer,
     third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder,
diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py
index e1481d3f05..c87835c412 100644
--- a/tests/unit/session/test_io_bigquery.py
+++ b/tests/unit/session/test_io_bigquery.py
@@ -59,6 +59,7 @@ def test_create_job_configs_labels_length_limit_not_met():


 def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit():
+    log_adapter.get_and_reset_api_methods()
     cur_labels = {
         "bigframes-api": "read_pandas",
         "source": "bigquery-dataframes-temp",
@@ -87,6 +88,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit():


 def test_create_job_configs_labels_length_limit_met_and_labels_is_none():
+    log_adapter.get_and_reset_api_methods()
     df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
     # Test running methods more than the labels' length limit
     for i in range(66):
@@ -102,6 +104,7 @@ def test_create_job_configs_labels_length_limit_met_and_labels_is_none():


 def test_create_job_configs_labels_length_limit_met():
+    log_adapter.get_and_reset_api_methods()
     cur_labels = {
         "bigframes-api": "read_pandas",
         "source": "bigquery-dataframes-temp",

From d7957fad071d223ef8f6fb8f3de395c865ff60aa Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Tue, 21 Nov 2023 21:26:52 -0800
Subject: [PATCH 12/26] docs: code samples for dataframe.any, dataframe.all
 and dataframe.prod (#223)

* docs: code samples for dataframe.any, dataframe.all and dataframe.prod

* Update examples

* update example output
---
 .../bigframes_vendored/pandas/core/frame.py   | 85 ++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index f448ad7939..b771be3041 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2584,6 +2584,33 @@ def any(self, *, axis=0, bool_only: bool = False):
         along a Dataframe axis that is True or equivalent (e.g. non-zero or
         non-empty).

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]})
+            >>> df
+                  A      B
+            0  True  False
+            1  True  False
+
+            [2 rows x 2 columns]
+
+        Checking if each column contains at least one True element (the default
+        behavior without an explicit axis parameter).
+
+            >>> df.any()
+            A     True
+            B    False
+            dtype: boolean
+
+        Checking if each row contains at least one True element.
+
+            >>> df.any(axis=1)
+            0    True
+            1    True
+            dtype: boolean
+
         Args:
             axis ({index (0), columns (1)}):
                 Axis for the function to be applied on.
@@ -2604,6 +2631,33 @@ def all(self, axis=0, *, bool_only: bool = False):
         along a DataFrame axis that is False or equivalent (e.g. zero or
         empty).
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]})
+            >>> df
+                  A      B
+            0  True  False
+            1  True  False
+
+            [2 rows x 2 columns]
+
+        Checking if all values in each column are True (the default behavior
+        without an explicit axis parameter).
+
+            >>> df.all()
+            A     True
+            B    False
+            dtype: boolean
+
+        Checking across rows to see if all values are True.
+
+            >>> df.all(axis=1)
+            0    False
+            1    False
+            dtype: boolean
+
         Args:
             axis ({index (0), columns (1)}):
                 Axis for the function to be applied on.
@@ -2620,8 +2674,37 @@ def prod(self, axis=0, *, numeric_only: bool = False):
         """
         Return the product of the values over the requested axis.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]})
+            >>> df
+               A    B
+            0  1  4.5
+            1  2  5.5
+            2  3  6.5
+
+            [3 rows x 2 columns]
+
+        Calculating the product of each column (the default behavior without an
+        explicit axis parameter).
+
+            >>> df.prod()
+            A        6.0
+            B    160.875
+            dtype: Float64
+
+        Calculating the product of each row.
+
+            >>> df.prod(axis=1)
+            0     4.5
+            1    11.0
+            2    19.5
+            dtype: Float64
+
         Args:
-            aßxis ({index (0), columns (1)}):
+            axis ({index (0), columns (1)}):
                 Axis for the function to be applied on.
                 For Series this parameter is unused and defaults to 0.
             numeric_only (bool. default False):

From 71844b03cdbfe684320c186a0488c8c7fb4fcd6e Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Wed, 22 Nov 2023 23:46:14 +0000
Subject: [PATCH 13/26] docs: make the code samples reflect default bq
 connection usage (#206)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a
few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a
  [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose)
  before writing your code! That way we can discuss the change, evaluate
  designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes internal issue 305136837 🦕
---
 .../getting_started_bq_dataframes.ipynb      |   4 +-
 .../remote_functions/remote_function.ipynb   | 514 ++++++++----------
 samples/snippets/remote_function.py          |   6 +-
 3 files changed, 241 insertions(+), 283 deletions(-)

diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb
index 6cc6acc993..18be5e48fd 100644
--- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb
+++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb
@@ -802,7 +802,7 @@
    "source": [
     "Running the cell below creates a custom function using the `remote_function` method. This function categorizes a value into one of two buckets: >= 4000 or <4000.\n",
     "\n",
-    "> Note: Creating a function requires a [BigQuery connection](https://cloud.google.com/bigquery/docs/remote-functions#create_a_remote_function). This code assumes a pre-created connection named `bigframes-rf-conn`. If\n",
+    "> Note: Creating a function requires a [BigQuery connection](https://cloud.google.com/bigquery/docs/remote-functions#create_a_remote_function). This code assumes a pre-created connection named `bigframes-default-connection`.
If\n", "the connection is not already created, BigQuery DataFrames attempts to create one assuming the [necessary APIs\n", "and IAM permissions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) are set up in the project.\n", "\n", @@ -817,7 +817,7 @@ }, "outputs": [], "source": [ - "@bf.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", + "@bf.remote_function([float], str)\n", "def get_bucket(num):\n", " if not num: return \"NA\"\n", " boundary = 4000\n", diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 06be0e7293..063c1738b4 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "id": "3613b1cd", "metadata": {}, "outputs": [], @@ -16,24 +16,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "id": "f1175247", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 25.4 s, sys: 2.5 s, total: 27.9 s\n", - "Wall time: 2min 31s\n" + "CPU times: user 2.34 s, sys: 307 ms, total: 2.65 s\n", + "Wall time: 17.8 s\n" ] }, { @@ -141,7 +133,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 8" ] }, - "execution_count": 3, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -160,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "id": "fd8a04a3", "metadata": {}, "outputs": [], @@ -191,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "id": "2b5e4568", "metadata": {}, "outputs": [ @@ -199,8 +191,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.22 s, sys: 18.2 ms, total: 4.24 s\n", - "Wall time: 4.26 s\n" + "CPU times: user 3.32 s, sys: 0 ns, total: 3.32 s\n", + "Wall time: 3.32 s\n" ] }, { @@ -319,7 +311,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 8 19" ] }, - "execution_count": 5, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -333,65 +325,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": "b81feaef", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2b1c9d671db14d2ca3be6a0b0c698430", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 6b0a39de-40a0-4dd4-be88-248bd8ebcd77 is RUNNING. " ] }, "metadata": {}, @@ -399,13 +351,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "09706700e8dd4cf39f65a0d58371c1eb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job a283cb39-41b1-44cd-a6c3-f2a2c6a55b25 is DONE. 17.2 GB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4c1d9d3e-be25-4818-b74d-6214164d99ab is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -440,62 +390,62 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " \n", " \n", @@ -505,21 +455,21 @@ ], "text/plain": [ " id text score\n", - "0 11012908 you're welcome! according to the docs it shoul... 0\n", - "1 11013760 You *should* be concerned with the disk being ... 0\n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0\n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0\n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0\n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 
0\n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0\n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0\n", - "8 11018225 Thank you very much. I do worry too much abou... 0\n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0\n", + "0 11231597 In your update, why are some of the system fun... 0\n", + "1 49684807 what you have tried so far . ?? 1\n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0\n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0\n", + "4 6426146 do you know the equation to your pdf? 0\n", + "5 60686114 m sorry but at least you have to think about it. 0\n", + "6 16631986 i think also making disable this by only jquer... 0\n", + "7 16498565 I am including these files on my header of the... 0\n", + "8 26601001 wrong answer, you didn't understand the logic 0\n", + "9 73255842 Call the setOnClickListener before return row. 0\n", "\n", "[10 rows x 3 columns]" ] }, - "execution_count": 6, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -539,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 24, "id": "55ed241e", "metadata": {}, "outputs": [ @@ -549,8 +499,9 @@ "text": [ "Help on function remote_function in module bigframes.pandas:\n", "\n", - "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", - " Decorator to turn a user defined function into a BigQuery remote function.\n", + "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True, name: 'Optional[str]' = None, packages: 'Optional[Sequence[str]]' = None)\n", + " Decorator to turn a user defined function into a BigQuery remote function. Check out\n", + " the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.\n", " \n", " .. note::\n", " Please make sure following is setup before using this API:\n", @@ -576,7 +527,7 @@ " * BigQuery Data Editor (roles/bigquery.dataEditor)\n", " * BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", " * Cloud Functions Developer (roles/cloudfunctions.developer)\n", - " * Service Account User (roles/iam.serviceAccountUser)\n", + " * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com`\n", " * Storage Object Viewer (roles/storage.objectViewer)\n", " * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", " \n", @@ -602,15 +553,25 @@ " Name of the BigQuery connection. You should either have the\n", " connection already created in the `location` you have chosen, or\n", " you should have the Project IAM Admin role to enable the service\n", - " to create the connection for you if you need it.If this parameter is\n", + " to create the connection for you if you need it. 
If this parameter is\n", " not provided then the BigQuery connection from the session is used.\n", " reuse (bool, Optional):\n", " Reuse the remote function if already exists.\n", " `True` by default, which will result in reusing an existing remote\n", - " function (if any) that was previously created for the same udf.\n", - " Setting it to false would force creating a unique remote function.\n", + " function and corresponding cloud function (if any) that was\n", + " previously created for the same udf.\n", + " Setting it to `False` would force creating a unique remote function.\n", " If the required remote function does not exist then it would be\n", " created irrespective of this param.\n", + " name (str, Optional):\n", + " Explicit name of the persisted BigQuery remote function. Use it with\n", + " caution, because two users working in the same project and dataset\n", + " could overwrite each other's remote functions if they use the same\n", + " persistent name.\n", + " packages (str[], Optional):\n", + " Explicit name of the external package dependencies. Each dependency\n", + " is added to the `requirements.txt` as is, and can be of the form\n", + " supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/.\n", " Returns:\n", " callable: A remote function object pointing to the cloud assets created\n", " in the background to support the remote execution. The cloud assets can be\n", @@ -631,49 +592,16 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "c9a8d03d", - "metadata": {}, - "outputs": [], - "source": [ - "# BigQuery DataFrames user is a data scientist and may not have privileges to\n", - "# create a BQ connector and set it up for invoking a cloud function. They\n", - "# should get such a connector created from their cloud admin and use it with\n", - "# BigQuery DataFrames remote functions. 
If the provided connection name does not\n", - "# exist, BigQuery DataFrames will try to create it on the fly assuming the user\n", - "# has sufficient privileges.\n", - "bq_connection_name = 'bigframes-rf-conn'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "id": "fbc27f81", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO][2023-08-18 21:23:29,687][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpl2ewfnue --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", - "[INFO][2023-08-18 21:24:43,689][bigframes.remote_function] Successfully created cloud function bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 with uri (https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app)\n", - "[INFO][2023-08-18 21:24:57,348][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", - "[INFO][2023-08-18 21:24:57,351][bigframes.remote_function] Creating BQ remote function: \n", - " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4(n INT64)\n", - " RETURNS INT64\n", - " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", - " OPTIONS (\n", - " endpoint = \"https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app\"\n", - " )\n", - "[INFO][2023-08-18 21:24:58,300][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "\n", - "Wall time: 89.0601 s\n" + "Wall time: 76.2628 s\n" ] } ], @@ -684,7 +612,7 @@ "\n", "# User defined function\n", "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n", - "@pd.remote_function([int], int, bigquery_connection=bq_connection_name)\n", + "@pd.remote_function([int], int, reuse=False)\n", "def nth_prime(n):\n", " prime_numbers = [2,3]\n", " i=3\n", @@ -712,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 26, "id": "c1c9355f", "metadata": {}, "outputs": [ @@ -720,33 +648,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 16.8 ms, sys: 61 µs, total: 16.8 ms\n", - "Wall time: 17 ms\n" + "CPU times: user 55.8 ms, sys: 182 µs, total: 56 ms\n", + "Wall time: 54.5 ms\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2f840ad27c514ed19c759a004b32de33", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 0f421233-9d02-4746-bb39-86a3b0880aba is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4f8d5734-8070-4630-8a59-c05a31d60476 is RUNNING. " ] }, "metadata": {}, @@ -754,13 +666,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "116d6ef3d6b247d3aaafef5fe6b970de", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job c0a2c187-364d-4978-97bc-30352828f624 is DONE. 17.2 GB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ec057f9e-726b-44f0-a5c0-24c05c7ecfeb is RUNNING. " ] }, "metadata": {}, @@ -796,71 +706,71 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! 
according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " -1\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", - " -1\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", + " 2\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " -1\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " -1\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " -1\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " -1\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " -1\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " -1\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " -1\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " -1\n", " \n", @@ -871,21 +781,21 @@ ], "text/plain": [ " id text score n_prime\n", - "0 11012908 you're welcome! according to the docs it shoul... 0 -1\n", - "1 11013760 You *should* be concerned with the disk being ... 0 -1\n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 -1\n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0 -1\n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0 -1\n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 -1\n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0 -1\n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 -1\n", - "8 11018225 Thank you very much. I do worry too much abou... 0 -1\n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0 -1\n", + "0 11231597 In your update, why are some of the system fun... 0 -1\n", + "1 49684807 what you have tried so far . ?? 1 2\n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0 -1\n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0 -1\n", + "4 6426146 do you know the equation to your pdf? 0 -1\n", + "5 60686114 m sorry but at least you have to think about it. 0 -1\n", + "6 16631986 i think also making disable this by only jquer... 0 -1\n", + "7 16498565 I am including these files on my header of the... 0 -1\n", + "8 26601001 wrong answer, you didn't understand the logic 0 -1\n", + "9 73255842 Call the setOnClickListener before return row. 
0 -1\n", "\n", "[10 rows x 4 columns]" ] }, - "execution_count": 10, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -900,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 27, "id": "2701cb81", "metadata": {}, "outputs": [ @@ -908,8 +818,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n", - "projects/bigframes-dev/locations/us-central1/functions/bigframes-b0feb1fbaf8188b64d7e70118d93c5d4\n" + "shobs-test.bigframes_temp_us.bigframes_343b7b4bb93ca8747dae20c22bdaec8b_p27heyce\n", + "projects/shobs-test/locations/us-central1/functions/bigframes-343b7b4bb93ca8747dae20c22bdaec8b-p27heyce\n" ] } ], @@ -922,7 +832,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 28, "id": "920fa18e", "metadata": {}, "outputs": [ @@ -937,6 +847,42 @@ " \n", " Then it can be applied to a DataFrame or Series.\n", " \n", + " .. note::\n", + " The return type of the function must be explicitly specified in the\n", + " function's original definition even if not otherwise required.\n", + " \n", + " BigQuery Utils provides many public functions under the ``bqutil`` project on Google Cloud Platform project\n", + " (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs#using-the-udfs).\n", + " You can checkout Community UDFs to use community-contributed functions.\n", + " (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs/community#community-udfs).\n", + " \n", + " **Examples:**\n", + " \n", + " Use the ``cw_lower_case_ascii_only`` function from Community UDFs.\n", + " (https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/cw_lower_case_ascii_only.sqlx)\n", + " \n", + " >>> import bigframes.pandas as bpd\n", + " >>> bpd.options.display.progress_bar = None\n", + " \n", + " >>> df = bpd.DataFrame({'id': [1, 2, 3], 'name': ['AURÉLIE', 'CÉLESTINE', 'DAPHNÉ']})\n", + " >>> df\n", + " id name\n", + " 0 1 AURÉLIE\n", + " 1 2 CÉLESTINE\n", + " 2 3 DAPHNÉ\n", + " \n", + " [3 rows x 2 columns]\n", + " \n", + " >>> func = bpd.read_gbq_function(\"bqutil.fn.cw_lower_case_ascii_only\")\n", + " >>> df1 = df.assign(new_name=df['name'].apply(func))\n", + " >>> df1\n", + " id name new_name\n", + " 0 1 AURÉLIE aurÉlie\n", + " 1 2 CÉLESTINE cÉlestine\n", + " 2 3 DAPHNÉ daphnÉ\n", + " \n", + " [3 rows x 3 columns]\n", + " \n", " Args:\n", " function_name (str):\n", " the function's name in BigQuery in the format\n", @@ -965,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 29, "id": "a6c9da0a", "metadata": {}, "outputs": [], @@ -978,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "id": "d7e7de7f", "metadata": {}, "outputs": [ @@ -986,33 +932,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 10.9 ms, sys: 0 ns, total: 10.9 ms\n", - "Wall time: 11.4 ms\n" + "CPU times: user 70.8 ms, sys: 3.49 ms, total: 74.3 ms\n", + "Wall time: 75.2 ms\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "73d1a73593cb4115821ab128c221a48d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job bec5f7d1-3df1-4292-8c68-c396bce7dc5d is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 02e3bf43-a387-41c7-85c7-4a5366251de7 is RUNNING. 
" ] }, "metadata": {}, @@ -1020,13 +950,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "faf93766ce1e489183c86a9daf5ce7d1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4d3da7ed-42e6-4b2b-b656-ac9ef6d2e871 is DONE. 17.2 GB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fa4329e8-2918-44c4-96c5-d8591364abc9 is RUNNING. " ] }, "metadata": {}, @@ -1063,80 +991,80 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", - " -1\n", - " -1\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", + " 2\n", + " 2\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " -1\n", " -1\n", @@ -1148,20 +1076,20 @@ ], "text/plain": [ " id text score \\\n", - "0 11012908 you're welcome! according to the docs it shoul... 0 \n", - "1 11013760 You *should* be concerned with the disk being ... 0 \n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 \n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0 \n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0 \n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 \n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0 \n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 \n", - "8 11018225 Thank you very much. I do worry too much abou... 0 \n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0 \n", + "0 11231597 In your update, why are some of the system fun... 0 \n", + "1 49684807 what you have tried so far . ?? 1 \n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0 \n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0 \n", + "4 6426146 do you know the equation to your pdf? 
0 \n", + "5 60686114 m sorry but at least you have to think about it. 0 \n", + "6 16631986 i think also making disable this by only jquer... 0 \n", + "7 16498565 I am including these files on my header of the... 0 \n", + "8 26601001 wrong answer, you didn't understand the logic 0 \n", + "9 73255842 Call the setOnClickListener before return row. 0 \n", "\n", " n_prime n_prime_again \n", "0 -1 -1 \n", - "1 -1 -1 \n", + "1 2 2 \n", "2 -1 -1 \n", "3 -1 -1 \n", "4 -1 -1 \n", @@ -1174,7 +1102,7 @@ "[10 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1186,6 +1114,38 @@ "df = df.assign(n_prime_again=df['score'].apply(nth_prime_existing))\n", "df.head(10)" ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "bafab950", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up GCP assets created as part of bigframes remote_function\n", + "def cleanup_remote_function_assets(remote_udf, ignore_failures=False):\n", + " \"\"\"Clean up the GCP assets behind a bigframes remote function.\"\"\"\n", + "\n", + " session = pd.get_global_session()\n", + "\n", + " # Clean up BQ remote function\n", + " try:\n", + " session.bqclient.delete_routine(remote_udf.bigframes_remote_function)\n", + " except Exception:\n", + " # By default don't raise exception in cleanup\n", + " if not ignore_failures:\n", + " raise\n", + "\n", + " # Clean up cloud function\n", + " try:\n", + " session.cloudfunctionsclient.delete_function(name=remote_udf.bigframes_cloud_function)\n", + " except Exception:\n", + " # By default don't raise exception in cleanup\n", + " if not ignore_failures:\n", + " raise\n", + "\n", + "cleanup_remote_function_assets(nth_prime)" + ] } ], "metadata": { diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 646d7b0c30..61b7dc092a 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -38,8 +38,8 @@ def run_remote_function_and_read_gbq_function(project_id: str): # function. It requires a BigQuery connection. If the connection is not # already created, BigQuery DataFrames will attempt to create one assuming # the necessary APIs and IAM permissions are setup in the project. In our - # examples we would be using a pre-created connection named - # `bigframes-rf-conn`. We will also set `reuse=False` to make sure we don't + # examples we will be letting the default connection `bigframes-default-connection` + # be used. We will also set `reuse=False` to make sure we don't # step over someone else creating remote function in the same project from # the exact same source code at the same time. 
Let's try a `pandas`-like use # case in which we want to apply a user defined scalar function to every @@ -49,7 +49,6 @@ def run_remote_function_and_read_gbq_function(project_id: str): @bpd.remote_function( [float], str, - bigquery_connection="bigframes-rf-conn", reuse=False, ) def get_bucket(num): @@ -94,7 +93,6 @@ def get_bucket(num): @bpd.remote_function( [str], str, - bigquery_connection="bigframes-rf-conn", reuse=False, packages=["cryptography"], ) From 3a375e87b64b8fb51370bfec8f2cfdbcd8fe960a Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:52:22 -0800 Subject: [PATCH 14/26] docs: add examples for dataframe.min, dataframe.max and dataframe.sum (#227) * docs: add examples for dataframe.min, dataframe.max and dataframe.sum * update spacing --- .../bigframes_vendored/pandas/core/frame.py | 87 ++++++++++++++++++- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b771be3041..e41ac905aa 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2597,7 +2597,7 @@ def any(self, *, axis=0, bool_only: bool = False): [2 rows x 2 columns] - Checking if each column contains at least one True element(the default behavior without an explicit axis parameter). + Checking if each column contains at least one True element (the default behavior without an explicit axis parameter). >>> df.any() A True @@ -2644,7 +2644,7 @@ def all(self, axis=0, *, bool_only: bool = False): [2 rows x 2 columns] - Checking if all values in each column are True(the default behavior without an explicit axis parameter). + Checking if all values in each column are True (the default behavior without an explicit axis parameter). >>> df.all() A True @@ -2688,7 +2688,7 @@ def prod(self, axis=0, *, numeric_only: bool = False): [3 rows x 2 columns] - Calculating the product of each column(the default behavior without an explicit axis parameter). + Calculating the product of each column (the default behavior without an explicit axis parameter). >>> df.prod() A 6.0 @@ -2721,6 +2721,33 @@ def min(self, axis=0, *, numeric_only: bool = False): If you want the *index* of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Finding the minimum value in each column (the default behavior without an explicit axis parameter). + + >>> df.min() + A 1.0 + B 2.0 + dtype: Float64 + + Finding the minimum value in each row. + + >>> df.min(axis=1) + 0 1.0 + 1 3.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2739,6 +2766,33 @@ def max(self, axis=0, *, numeric_only: bool = False): If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Finding the maximum value in each column (the default behavior without an explicit axis parameter). 
+ + >>> df.max() + A 3.0 + B 4.0 + dtype: Float64 + + Finding the maximum value in each row. + + >>> df.max(axis=1) + 0 2.0 + 1 4.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2756,6 +2810,33 @@ def sum(self, axis=0, *, numeric_only: bool = False): This is equivalent to the method ``numpy.sum``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Calculating the sum of each column (the default behavior without an explicit axis parameter). + + >>> df.sum() + A 4.0 + B 6.0 + dtype: Float64 + + Calculating the sum of each row. + + >>> df.sum(axis=1) + 0 3.0 + 1 7.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. From b62a07a95cd60f995a48825c9874822d0eb02483 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 24 Nov 2023 00:10:18 +0000 Subject: [PATCH 15/26] docs: code samples for `Series.dot` and `DataFrame.dot` (#226) --- bigframes/dataframe.py | 3 +- bigframes/operations/base.py | 2 +- tests/system/small/test_dataframe.py | 23 ++++++ .../bigframes_vendored/pandas/core/frame.py | 71 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 15 ++++ 5 files changed, 112 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1f1275e217..8567296e29 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2797,7 +2797,8 @@ def get_right_id(id): result = result[other_frame.columns] if isinstance(other, bf_series.Series): - result = result[other.name].rename() + # There should be exactly one column in the result + result = result[result.columns[0]].rename() return result diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index d33befe4da..85ce1dd9e6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -141,7 +141,7 @@ def _apply_binary_op( if isinstance(other, pd.Series): # TODO: Convert to BigQuery DataFrames series raise NotImplementedError( - f"Pandas series not supported supported as operand. {constants.FEEDBACK_LINK}" + f"Pandas series not supported as operand. {constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): (left, right, block) = self._align(other, how=alignment) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a0cf25807c..e25e9ce501 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3493,6 +3493,29 @@ def test_df_dot_operator( ) +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. 
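+    # (BigQuery DataFrames surfaces BigQuery INT64 columns as pandas'
+    # nullable Int64 extension dtype, hence the casts below.)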
+ pd_result = pd_result.astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e41ac905aa..a1aac5d2b5 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3485,6 +3485,77 @@ def dot(self, other): The dot method for Series computes the inner product, instead of the matrix product here. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> left + 0 1 2 3 + 0 0 1 -2 -1 + 1 1 1 1 1 + + [2 rows x 4 columns] + >>> right = bpd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> right + 0 1 + 0 0 1 + 1 1 2 + 2 -1 -1 + 3 2 0 + + [4 rows x 2 columns] + >>> left.dot(right) + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + You can also use the operator ``@`` for the dot product: + + >>> left @ right + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + The right input can be a Series, in which case the result will also be a + Series: + + >>> right = bpd.Series([1, 2, -1,0]) + >>> left @ right + 0 4 + 1 2 + dtype: Int64 + + Any user defined index of the left matrix and columns of the right + matrix will reflect in the result. + + >>> left = bpd.DataFrame([[1, 2, 3], [2, 5, 7]], index=["alpha", "beta"]) + >>> left + 0 1 2 + alpha 1 2 3 + beta 2 5 7 + + [2 rows x 3 columns] + >>> right = bpd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]], columns=["red", "green", "blue"]) + >>> right + red green blue + 0 2 4 8 + 1 1 5 10 + 2 3 6 9 + + [3 rows x 3 columns] + >>> left.dot(right) + red green blue + alpha 13 32 55 + beta 30 75 129 + + [2 rows x 3 columns] + Args: other (Series or DataFrame): The other object to compute the matrix product with. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a86765a412..1b751ed83b 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -631,6 +631,21 @@ def dot(self, other) -> Series | np.ndarray: BigQuery Dataframes does not validate this property and will produce incorrect results if indices are not equal. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + >>> other = bpd.Series([-1, 2, -3, 4]) + >>> s.dot(other) + 8 + + You can also use the operator ``@`` for the dot product: + + >>> s @ other + 8 + Args: other (Series): The other object to compute the dot product with its columns. From f9c6e727e2b901310bb5301da449d616ea85e135 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 28 Nov 2023 10:48:21 -0800 Subject: [PATCH 16/26] docs: add examples for dataframe.kurt, dataframe.std, dataframe.count (#232) * docs: add examples for dataframe.kurt, dataframe.std, dataframe.count * update count example * update count example * update examples * update . 
to : --- .../bigframes_vendored/pandas/core/frame.py | 96 +++++++++++++++++-- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index a1aac5d2b5..a7018ed3a2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2597,14 +2597,14 @@ def any(self, *, axis=0, bool_only: bool = False): [2 rows x 2 columns] - Checking if each column contains at least one True element (the default behavior without an explicit axis parameter). + Checking if each column contains at least one True element(the default behavior without an explicit axis parameter): >>> df.any() A True B False dtype: boolean - Checking if each row contains at least one True element. + Checking if each row contains at least one True element: >>> df.any(axis=1) 0 True @@ -2644,14 +2644,14 @@ def all(self, axis=0, *, bool_only: bool = False): [2 rows x 2 columns] - Checking if all values in each column are True (the default behavior without an explicit axis parameter). + Checking if all values in each column are True(the default behavior without an explicit axis parameter): >>> df.all() A True B False dtype: boolean - Checking across rows to see if all values are True. + Checking across rows to see if all values are True: >>> df.all(axis=1) 0 False @@ -2688,14 +2688,14 @@ def prod(self, axis=0, *, numeric_only: bool = False): [3 rows x 2 columns] - Calculating the product of each column (the default behavior without an explicit axis parameter). + Calculating the product of each column(the default behavior without an explicit axis parameter): >>> df.prod() A 6.0 B 160.875 dtype: Float64 - Calculating the product of each row. + Calculating the product of each row: >>> df.prod(axis=1) 0 4.5 @@ -2911,11 +2911,37 @@ def skew(self, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def kurt(self, *, numeric_only: bool = False): - """Return unbiased kurtosis over requested axis. + """Return unbiased kurtosis over columns. Kurtosis obtained using Fisher's definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], + ... "B": [3, 4, 3, 2, 1], + ... "C": [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 3 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the kurtosis value of each column: + + >>> df.kurt() + A -1.2 + B -0.177515 + C 5.0 + dtype: Float64 + Args: numeric_only (bool, default False): Include only float, int, boolean columns. @@ -2926,10 +2952,36 @@ def kurt(self, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self, *, numeric_only: bool = False): - """Return sample standard deviation over requested axis. + """Return sample standard deviation over columns. Normalized by N-1 by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], + ... "B": [3, 4, 3, 2, 1], + ... "C": [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 3 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the standard deviation of each column: + + >>> df.std() + A 1.581139 + B 1.140175 + C 0.447214 + dtype: Float64 + Args: numeric_only (bool. 
default False):
                Default False. Include only float, int, boolean columns.
 
@@ -2941,11 +2993,37 @@ def std(self, *, numeric_only: bool = False):
 
     def count(self, *, numeric_only: bool = False):
         """
-        Count non-NA cells for each column or row.
+        Count non-NA cells for each column.
 
         The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
         on `pandas.options.mode.use_inf_as_na`) are considered NA.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5],
+        ...                     "B": [1, 2, 3, 4, 5],
+        ...                     "C": [None, 3.5, None, 4.5, 5.0]})
+        >>> df
+              A  B     C
+        0   1.0  1  <NA>
+        1  <NA>  2   3.5
+        2   3.0  3  <NA>
+        3   4.0  4   4.5
+        4   5.0  5   5.0
+
+        [5 rows x 3 columns]
+
+        Counting non-NA values for each column:
+
+        >>> df.count()
+        A    4.0
+        B    5.0
+        C    3.0
+        dtype: Float64
+
         Args:
             numeric_only (bool, default False):
                 Include only `float`, `int` or `boolean` data.

From edd0522747eadb74780124fb18ed7face251441d Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Tue, 28 Nov 2023 11:40:28 -0800
Subject: [PATCH 17/26] =?UTF-8?q?docs:=20add=20examples=20for=20dataframe.?=
 =?UTF-8?q?mean,=20dataframe.median,=20dataframe.va=E2=80=A6=20(#228)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* docs: add examples for dataframe.mean, dataframe.median, dataframe.var
and dataframe.skew

* column to columns

* update var example

---
 .../bigframes_vendored/pandas/core/frame.py   | 105 +++++++++++++++++-
 1 file changed, 103 insertions(+), 2 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index a7018ed3a2..2a8972f2e5 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2852,6 +2852,33 @@ def sum(self, axis=0, *, numeric_only: bool = False):
     def mean(self, axis=0, *, numeric_only: bool = False):
         """Return the mean of the values over the requested axis.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]})
+        >>> df
+           A  B
+        0  1  2
+        1  3  4
+
+        [2 rows x 2 columns]
+
+        Calculating the mean of each column (the default behavior without an explicit axis parameter).
+
+        >>> df.mean()
+        A    2.0
+        B    3.0
+        dtype: Float64
+
+        Calculating the mean of each row.
+
+        >>> df.mean(axis=1)
+        0    1.5
+        1    3.5
+        dtype: Float64
+
         Args:
             axis ({index (0), columns (1)}):
                 Axis for the function to be applied on.
@@ -2865,7 +2892,27 @@ def mean(self, axis=0, *, numeric_only: bool = False):
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def median(self, *, numeric_only: bool = False, exact: bool = False):
-        """Return the median of the values over the requested axis.
+        """Return the median of the values over columns.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]})
+        >>> df
+           A  B
+        0  1  2
+        1  3  4
+
+        [2 rows x 2 columns]
+
+        Finding the median value of each column.
+
+        >>> df.median()
+        A    1.0
+        B    2.0
+        dtype: Float64
 
         Args:
             numeric_only (bool, default False):
@@ -2884,6 +2931,34 @@ def var(self, axis=0, *, numeric_only: bool = False):
 
         Normalized by N-1 by default.
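+        (That is, the sample variance is computed as
+        ``sum((x - x.mean())**2) / (N - 1)``.)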
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Calculating the variance of each column (the default behavior without an explicit axis parameter). + + >>> df.var() + A 2.0 + B 2.0 + dtype: Float64 + + Calculating the variance of each row. + + >>> df.var(axis=1) + 0 0.5 + 1 0.5 + dtype: Float64 + + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2897,10 +2972,36 @@ def var(self, axis=0, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def skew(self, *, numeric_only: bool = False): - """Return unbiased skew over requested axis. + """Return unbiased skew over columns. Normalized by N-1. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], + ... 'B': [5, 4, 3, 2, 1], + ... 'C': [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 5 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the skewness of each column. + + >>> df.skew() + A 0.0 + B 0.0 + C 2.236068 + dtype: Float64 + Args: numeric_only (bool, default False): Include only float, int, boolean columns. From ae03756f5ee45e0e74e0c0bdd4777e018eba2273 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:53:51 -0800 Subject: [PATCH 18/26] fix: make to_pandas override enable_downsampling when sampling_method is manually set. (#200) * fix: make to_pandas override enable_downsampling when sampling_method is manually set. * fix: make to_pandas override enable_downsampling when sampling_method is manually set. * fix: make to_pandas override enable_downsampling when sampling_method is manually set. --- bigframes/core/blocks.py | 41 ++++++++++++++++------------ tests/system/small/test_dataframe.py | 11 ++++++++ 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f1113d938e..34913872e7 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -389,23 +389,6 @@ def to_pandas( ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" - if max_download_size is None: - max_download_size = bigframes.options.sampling.max_download_size - if sampling_method is None: - sampling_method = ( - bigframes.options.sampling.sampling_method - if bigframes.options.sampling.sampling_method is not None - else _UNIFORM - ) - if random_state is None: - random_state = bigframes.options.sampling.random_state - - sampling_method = sampling_method.lower() - if sampling_method not in _SAMPLING_METHODS: - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) df, _, query_job = self._compute_and_count( value_keys=value_keys, @@ -453,6 +436,28 @@ def _compute_and_count( ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. 
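+        # An explicitly passed sampling_method is treated as an implicit
+        # opt-in to downsampling: it overrides the global
+        # bigframes.options.sampling.enable_downsampling option, while each
+        # unset argument falls back to its global default below.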
+ enable_downsampling = ( + True + if sampling_method is not None + else bigframes.options.sampling.enable_downsampling + ) + + max_download_size = ( + max_download_size or bigframes.options.sampling.max_download_size + ) + + random_state = random_state or bigframes.options.sampling.random_state + + if sampling_method is None: + sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM + sampling_method = sampling_method.lower() + + if sampling_method not in _SAMPLING_METHODS: + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) + expr = self._apply_value_keys_to_expr(value_keys=value_keys) results_iterator, query_job = expr.start_query( @@ -469,7 +474,7 @@ def _compute_and_count( ) if fraction < 1: - if not bigframes.options.sampling.enable_downsampling: + if not enable_downsampling: raise RuntimeError( f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e25e9ce501..9b9567418b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3546,3 +3546,14 @@ def test_df_dot_operator_series( bf_result, pd_result, ) + + +def test_to_pandas_downsampling_option_override(session): + df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") + download_size = 1 + + df = df.to_pandas(max_download_size=download_size, sampling_method="head") + + total_memory_bytes = df.memory_usage(deep=True).sum() + total_memory_mb = total_memory_bytes / (1024 * 1024) + assert total_memory_mb == pytest.approx(download_size, rel=0.3) From 9d6613d318b558722b7bab12773efdea4bbe9931 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 28 Nov 2023 13:46:18 -0800 Subject: [PATCH 19/26] feat: add info and memory_usage methods to dataframe (#219) --- .pre-commit-config.yaml | 2 +- bigframes/_config/display_options.py | 4 + bigframes/core/indexes/index.py | 13 ++- bigframes/dataframe.py | 84 +++++++++++++++++++ bigframes/dtypes.py | 13 +++ noxfile.py | 1 + setup.py | 1 + tests/system/small/test_dataframe.py | 42 ++++++++++ .../pandas/core/config_init.py | 11 +++ .../bigframes_vendored/pandas/core/frame.py | 66 +++++++++++++++ 10 files changed, 235 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e0fd8b98f..517176da89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests] + additional_dependencies: [types-requests, types-tabulate] diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index ad3ea3f68c..afa36aa84c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -32,6 +32,10 @@ class DisplayOptions: progress_bar: Optional[str] = "auto" repr_mode: Literal["head", "deferred"] = "head" + max_info_columns: int = 100 + max_info_rows: Optional[int] = 200000 + memory_usage: bool = True + @contextlib.contextmanager def pandas_repr(display_options: DisplayOptions): diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6c66c36062..fc7cf167d4 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -155,6 +155,14 @@ def _block(self) -> blocks.Block: def T(self) -> Index: return 
self.transpose()
 
+    def _memory_usage(self) -> int:
+        (n_rows,) = self.shape
+        return sum(
+            self.dtypes.map(
+                lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
+            )
+        )
+
     def transpose(self) -> Index:
         return self
 
@@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:
 
     def __getitem__(self, key: int) -> typing.Any:
         if isinstance(key, int):
-            result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
+            if key != -1:
+                result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
+            else:  # special case, want [-1:] instead of [-1:0]
+                result_pd_df, _ = self._block.slice(key).to_pandas()
             if result_pd_df.empty:
                 raise IndexError("single positional indexer is out-of-bounds")
             return result_pd_df.index[0]
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 8567296e29..f7796291b9 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -18,6 +18,7 @@
 
 import datetime
 import re
+import sys
 import textwrap
 import typing
 from typing import (
@@ -36,6 +37,7 @@
 import google.cloud.bigquery as bigquery
 import numpy
 import pandas
+import tabulate
 
 import bigframes
 import bigframes._config.display_options as display_options
@@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
             self._set_internal_query_job(self._compute_dry_run())
         return self._query_job
 
+    def memory_usage(self, index: bool = True):
+        n_rows, _ = self.shape
+        # Like pandas, treat all variable-size objects as just 8-byte pointers,
+        # ignoring the actual object size.
+        column_sizes = self.dtypes.map(
+            lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
+        )
+        if index:
+            index_size = pandas.Series([self.index._memory_usage()], index=["Index"])
+            column_sizes = pandas.concat([index_size, column_sizes])
+        return column_sizes
+
+    def info(
+        self,
+        verbose: Optional[bool] = None,
+        buf=None,
+        max_cols: Optional[int] = None,
+        memory_usage: Optional[bool] = None,
+        show_counts: Optional[bool] = None,
+    ):
+        obuf = buf or sys.stdout
+
+        n_rows, n_columns = self.shape
+
+        max_cols = (
+            max_cols
+            if max_cols is not None
+            else bigframes.options.display.max_info_columns
+        )
+
+        show_all_columns = verbose if verbose is not None else (n_columns < max_cols)
+
+        obuf.write(f"{type(self)}\n")
+
+        index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"
+
+        # These accesses are kind of expensive, maybe should try to skip?
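+        # Each lookup runs a small slice query; index[-1] relies on the [-1:]
+        # special case added to Index.__getitem__ above.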
+ first_indice = self.index[0] + last_indice = self.index[-1] + obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n") + + dtype_strings = self.dtypes.astype("string") + if show_all_columns: + obuf.write(f"Data columns (total {n_columns} columns):\n") + column_info = self.columns.to_frame(name="Column") + + max_rows = bigframes.options.display.max_info_rows + too_many_rows = n_rows > max_rows if max_rows is not None else False + + if show_counts if show_counts is not None else (not too_many_rows): + non_null_counts = self.count().to_pandas() + column_info["Non-Null Count"] = non_null_counts.map( + lambda x: f"{int(x)} non-null" + ) + + column_info["Dtype"] = dtype_strings + + column_info = column_info.reset_index(drop=True) + column_info.index.name = "#" + + column_info_formatted = tabulate.tabulate(column_info, headers="keys") # type: ignore + obuf.write(column_info_formatted) + obuf.write("\n") + + else: # Just number of columns and first, last + obuf.write( + f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n" + ) + dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items() + dtype_counts_formatted = ", ".join( + f"{dtype}({count})" for dtype, count in dtype_counts + ) + obuf.write(f"dtypes: {dtype_counts_formatted}\n") + + show_memory = ( + memory_usage + if memory_usage is not None + else bigframes.options.display.memory_usage + ) + if show_memory: + # TODO: Convert to different units (kb, mb, etc.) + obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n") + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index cd35e380c0..774eb74d06 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -143,6 +143,19 @@ # "string" and "string[pyarrow] are accepted" BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow") +# For the purposes of dataframe.memory_usage +# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes +DTYPE_BYTE_SIZES = { + pd.BooleanDtype(): 1, + pd.Int64Dtype(): 8, + pd.Float32Dtype(): 8, + pd.StringDtype(): 8, + pd.ArrowDtype(pa.time64("us")): 8, + pd.ArrowDtype(pa.timestamp("us")): 8, + pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8, + pd.ArrowDtype(pa.date32()): 8, +} + def ibis_dtype_to_bigframes_dtype( ibis_dtype: ibis_dtypes.DataType, diff --git a/noxfile.py b/noxfile.py index 8d6d641fc1..c1fb53f794 100644 --- a/noxfile.py +++ b/noxfile.py @@ -228,6 +228,7 @@ def mypy(session): "types-python-dateutil", "types-requests", "types-setuptools", + "types-tabulate", ] ) | set(SYSTEM_TEST_STANDARD_DEPENDENCIES) diff --git a/setup.py b/setup.py index 29eacb74a9..abf165b3df 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ "requests >=2.27.1", "scikit-learn >=1.2.2", "sqlalchemy >=1.4,<3.0dev", + "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", ] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9b9567418b..9744d3f6e9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import io
 import operator
 import tempfile
 import typing
@@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs):
     assert_pandas_df_equal(bf_result, pd_result)
 
 
+def test_df_memory_usage(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = scalars_pandas_df.memory_usage()
+    bf_result = scalars_df.memory_usage()
+
+    pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5)
+
+
+def test_df_info(scalars_dfs):
+    expected = (
+        "<class 'bigframes.dataframe.DataFrame'>\n"
+        "Index: 9 entries, 0 to 8\n"
+        "Data columns (total 13 columns):\n"
+        "  #  Column         Non-Null Count    Dtype\n"
+        "---  -------------  ----------------  ------------------------------\n"
+        "  0  bool_col       8 non-null        boolean\n"
+        "  1  bytes_col      6 non-null        object\n"
+        "  2  date_col       7 non-null        date32[day][pyarrow]\n"
+        "  3  datetime_col   6 non-null        timestamp[us][pyarrow]\n"
+        "  4  geography_col  4 non-null        geometry\n"
+        "  5  int64_col      8 non-null        Int64\n"
+        "  6  int64_too      9 non-null        Int64\n"
+        "  7  numeric_col    6 non-null        object\n"
+        "  8  float64_col    7 non-null        Float64\n"
+        "  9  rowindex_2     9 non-null        Int64\n"
+        " 10  string_col     8 non-null        string\n"
+        " 11  time_col       6 non-null        time64[us][pyarrow]\n"
+        " 12  timestamp_col  6 non-null        timestamp[us, tz=UTC][pyarrow]\n"
+        "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n"
+        "memory usage: 945 bytes\n"
+    )
+
+    scalars_df, _ = scalars_dfs
+    bf_result = io.StringIO()
+
+    scalars_df.info(buf=bf_result)
+
+    assert expected == bf_result.getvalue()
+
+
 def test_drop_index(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py
index 198654015e..dfb91dfeb8 100644
--- a/third_party/bigframes_vendored/pandas/core/config_init.py
+++ b/third_party/bigframes_vendored/pandas/core/config_init.py
@@ -33,6 +33,17 @@
         Instead estimated bytes processed will be shown. Dataframe and Series
         objects can still be computed with methods that explicitly execute and
         download results.
+    max_info_columns (int):
+        max_info_columns is used in DataFrame.info method to decide if
+        per column information will be printed.
+    max_info_rows (int or None):
+        df.info() will usually show null-counts for each column.
+        For large frames this can be quite slow. max_info_rows and max_info_cols
+        limit this null check only to frames with smaller dimensions than
+        specified.
+    memory_usage (bool):
+        This specifies if the memory usage of a DataFrame should be displayed when
+        df.info() is called. Valid values are True and False.
 """
 
 sampling_options_doc = """
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 2a8972f2e5..099d8b8e66 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -92,6 +92,72 @@ def values(self) -> np.ndarray:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def info(
+        self,
+        verbose: bool | None = None,
+        buf=None,
+        max_cols: int | None = None,
+        memory_usage: bool | None = None,
+        show_counts: bool | None = None,
+    ) -> None:
+        """
+        Print a concise summary of a DataFrame.
+
+        This method prints information about a DataFrame including
+        the index dtype and columns, non-null values and memory usage.
+
+        Args:
+            verbose (bool, optional):
+                Whether to print the full summary.
By default, the setting in
+                ``pandas.options.display.max_info_columns`` is followed.
+            buf (writable buffer, defaults to sys.stdout):
+                Where to send the output. By default, the output is printed to
+                sys.stdout. Pass a writable buffer if you need to further process
+                the output.
+            max_cols (int, optional):
+                When to switch from the verbose to the truncated output. If the
+                DataFrame has more than `max_cols` columns, the truncated output
+                is used. By default, the setting in
+                ``pandas.options.display.max_info_columns`` is used.
+            memory_usage (bool, optional):
+                Specifies whether total memory usage of the DataFrame
+                elements (including the index) should be displayed. By default,
+                this follows the ``pandas.options.display.memory_usage`` setting.
+                True always shows memory usage. False never shows memory usage.
+                Memory estimation is made based on column dtype and number of rows
+                assuming values consume the same memory amount for corresponding dtypes.
+            show_counts (bool, optional):
+                Whether to show the non-null counts. By default, this is shown
+                only if the DataFrame is smaller than
+                ``pandas.options.display.max_info_rows`` and
+                ``pandas.options.display.max_info_columns``. A value of True always
+                shows the counts, and False never shows the counts.
+
+        Returns:
+            None: This method prints a summary of a DataFrame and returns None."""
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def memory_usage(self, index: bool = True):
+        """
+        Return the memory usage of each column in bytes.
+
+        The memory usage can optionally include the contribution of
+        the index and elements of `object` dtype.
+
+        This value is displayed in `DataFrame.info` by default. This can be
+        suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+        Args:
+            index (bool, default True):
+                Specifies whether to include the memory usage of the DataFrame's
+                index in returned Series. If ``index=True``, the memory usage of
+                the index is the first item in the output.
+
+        Returns:
+            Series: A Series whose index is the original column names and whose values are the memory usage of each column in bytes.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
     def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:

From 69b016eae7ea97d84ceeb22ba09f5472841db072 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 28 Nov 2023 23:42:58 +0000
Subject: [PATCH 20/26] fix: use anonymous dataset to create `remote_function`
 (#205)

* fix: use anonymous dataset to create `remote_function`

* update README about anonymous dataset instead of bigframes_temp_location

* remove dataset creation step from remote function

This is because now the dataset is an anonymous dataset that must have
been created previously as part of bigframes session creation.

* restore create_dataset, guarded by get_dataset

---
 README.rst                                 |  7 ++--
 bigframes/remote_function.py               | 19 ++++++---
 bigframes/session/__init__.py              | 14 -------
 tests/system/large/test_remote_function.py | 47 +++++++++++++++++++++-
 tests/system/small/test_remote_function.py | 37 +++++-------------
 5 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/README.rst b/README.rst
index 5ddb4a7639..91dac12751 100644
--- a/README.rst
+++ b/README.rst
@@ -267,10 +267,9 @@ definition. To view and manage connections, do the following:
 
 3.
In the Explorer pane, expand that project and then expand External connections.
 
 BigQuery remote functions are created in the dataset you specify, or
-in a dataset with the name ``bigframes_temp_location``, where location is
-the location used by the BigQuery DataFrames session. For example,
-``bigframes_temp_us_central1``. To view and manage remote functions, do
-the following:
+in a special type of `hidden dataset <https://cloud.google.com/bigquery/docs/datasets#anonymous_datasets>`__
+referred to as an anonymous dataset. To view and manage remote functions created
+in a user provided dataset, do the following:
 
 1. Go to `BigQuery in the Google Cloud Console <https://console.cloud.google.com/bigquery>`__.
 2. Select the project in which you created the remote function.
diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py
index a39cd033f6..7280ac7d42 100644
--- a/bigframes/remote_function.py
+++ b/bigframes/remote_function.py
@@ -188,6 +188,7 @@ def create_bq_remote_function(
         # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2
         bq_function_args = []
         bq_function_return_type = BigQueryType.from_ibis(output_type)
+
         # We are expecting the input type annotations to be 1:1 with the input args
         for idx, name in enumerate(input_args):
             bq_function_args.append(
@@ -204,14 +205,22 @@ def create_bq_remote_function(
 
         logger.info(f"Creating BQ remote function: {create_function_ddl}")
 
-        # Make sure the dataset exists
+        # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and
+        # create it
         dataset = bigquery.Dataset(
             bigquery.DatasetReference.from_string(
                 self._bq_dataset, default_project=self._gcp_project_id
             )
         )
         dataset.location = self._bq_location
-        self._bq_client.create_dataset(dataset, exists_ok=True)
+        try:
+            # This check does not require bigquery.datasets.create IAM
+            # permission. So, if the dataset already exists, then the user can
+            # work without having that permission.
+            self._bq_client.get_dataset(dataset)
+        except google.api_core.exceptions.NotFound:
+            # This requires bigquery.datasets.create IAM permission
+            self._bq_client.create_dataset(dataset, exists_ok=True)
 
         # TODO: Use session._start_query() so we get progress bar
         query_job = self._bq_client.query(create_function_ddl)  # Make an API request.
@@ -610,7 +619,7 @@ def get_routine_reference( raise DatasetMissingError dataset_ref = bigquery.DatasetReference( - bigquery_client.project, session._session_dataset_id + bigquery_client.project, session._anonymous_dataset.dataset_id ) return dataset_ref.routine(routine_ref_str) @@ -778,9 +787,7 @@ def remote_function( dataset, default_project=bigquery_client.project ) else: - dataset_ref = bigquery.DatasetReference.from_string( - session._session_dataset_id, default_project=bigquery_client.project - ) + dataset_ref = session._anonymous_dataset bq_location, cloud_function_region = get_remote_function_locations( bigquery_client.location diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 928123ce74..d2f6137883 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -198,13 +198,6 @@ def cloudfunctionsclient(self): def resourcemanagerclient(self): return self._clients_provider.resourcemanagerclient - @property - def _session_dataset_id(self): - """A dataset for storing temporary objects local to the session - This is a workaround for remote functions that do not - yet support session-temporary instances.""" - return self._session_dataset.dataset_id - @property def _project(self): return self.bqclient.project @@ -229,13 +222,6 @@ def _create_bq_datasets(self): query_destination.dataset_id, ) - # Dataset for storing remote functions, which don't yet - # support proper session temporary storage yet - self._session_dataset = bigquery.Dataset( - f"{self.bqclient.project}.bigframes_temp_{self._location.lower().replace('-', '_')}" - ) - self._session_dataset.location = self._location - def close(self): """No-op. Temporary resources are deleted after 7 days.""" diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 6ed3e6511a..5cb4df188c 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -22,7 +22,7 @@ import textwrap from google.api_core.exceptions import NotFound, ResourceExhausted -from google.cloud import functions_v2 +from google.cloud import bigquery, functions_v2 import pandas import pytest import test_utils.prefixer @@ -1210,3 +1210,48 @@ def square(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_anonymous_dataset(session, scalars_dfs): + try: + # This usage of remote_function is expected to create the remote + # function in the bigframes session's anonymous dataset. Use reuse=False + # param to make sure parallel instances of the test don't step over each + # other due to the common anonymous dataset. 
+ @session.remote_function([int], int, reuse=False) + def square(x): + return x * x + + assert ( + bigquery.Routine(square.bigframes_remote_function).dataset_id + == session._anonymous_dataset.dataset_id + ) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 3d8532a13b..960a384126 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -62,13 +62,12 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection_and_permanent_dataset( +def session_with_bq_connection( bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: session = bigframes.Session( bigframes.BigQueryOptions(bq_connection=bq_cf_connection) ) - session._session_dataset = bigquery.Dataset(dataset_id_permanent) return session @@ -277,13 +276,11 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): @rf.remote_function( [int], int, - session=session_with_bq_connection_and_permanent_dataset, + session=session_with_bq_connection, ) def square(x): return x * x @@ -313,9 +310,7 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -323,7 +318,7 @@ def test_remote_function_via_session_default( # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. 
- @session_with_bq_connection_and_permanent_dataset.remote_function([int], int) + @session_with_bq_connection.remote_function([int], int) def square(x): return x * x @@ -391,15 +386,11 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -422,15 +413,11 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -451,13 +438,11 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection_and_permanent_dataset, scalars_dfs): +def test_series_map(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = scalars_dfs From f8917abc094e222e0435891d4d184b77bfe67722 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 28 Nov 2023 18:24:16 -0800 Subject: [PATCH 21/26] fix: update the llm+kmeans notebook with recent change (#236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 313682530 🦕 --- .../bq_dataframes_llm_kmeans.ipynb | 47 +++++-------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 8d75950925..5f74046fc0 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -366,18 +366,6 @@ "predicted_embeddings.head() " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4H_etYfsEOFP" - }, - "outputs": [], - "source": [ - "# Join the complaints with their embeddings in the same DataFrame\n", - "combined_df = downsampled_issues_df.join(predicted_embeddings)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -426,30 +414,19 @@ "outputs": [], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", - "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", + "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", + "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", "clustered_result.head(n=5)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Join the group number to the complaints and their text embeddings\n", - "combined_clustered_result = combined_df.join(clustered_result)\n", - "combined_clustered_result.head(n=5) " - ] - }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." + "Our dataframe combined_clustered_result now has three complaint columns: the content, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." ] }, { @@ -480,14 +457,14 @@ "source": [ "# Using bigframes, with syntax identical to pandas,\n", "# filter out the first and second groups\n", - "cluster_1_result = combined_clustered_result[\n", - " combined_clustered_result[\"CENTROID_ID\"] == 1\n", - "][[\"consumer_complaint_narrative\"]]\n", + "cluster_1_result = clustered_result[\n", + " clustered_result[\"CENTROID_ID\"] == 1\n", + "][[\"content\"]]\n", "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n", "\n", - "cluster_2_result = combined_clustered_result[\n", - " combined_clustered_result[\"CENTROID_ID\"] == 2\n", - "][[\"consumer_complaint_narrative\"]]\n", + "cluster_2_result = clustered_result[\n", + " clustered_result[\"CENTROID_ID\"] == 2\n", + "][[\"content\"]]\n", "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()" ] }, @@ -503,15 +480,15 @@ "prompt1 = 'comment list 1:\\n'\n", "for i in range(5):\n", " prompt1 += str(i + 1) + '. 
' + \\\n",
-    "           cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+    "           cluster_1_result_pandas[\"content\"].iloc[i] + '\\n'\n",
     "\n",
     "prompt2 = 'comment list 2:\\n'\n",
     "for i in range(5):\n",
     "    prompt2 += str(i + 1) + '. ' + \\\n",
-    "           cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+    "           cluster_2_result_pandas[\"content\"].iloc[i] + '\\n'\n",
     "\n",
     "print(prompt1)\n",
-    "print(prompt2)\n"
+    "print(prompt2)"
    ]
   },
   {

From fbc31ab0fb166e6ffab48b3507614fef566ec64c Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Wed, 29 Nov 2023 03:52:20 +0000
Subject: [PATCH 22/26] test: fix prerelease tests (#239)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The heads of google-cloud-bigquery and google-cloud-bigquery-storage
packages are installed with -e. This is leading to `from google.cloud
import bigquery` looking into the folder of the latter and running into
"ImportError: cannot import name 'bigquery' from 'google.cloud'
(.../google-cloud-bigquery-storage/google/cloud/__init__.py)". Removing
-e from the google-cloud-bigquery installation gets rid of this error.

Fixes internal issue 313701211 🦕
---
 noxfile.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/noxfile.py b/noxfile.py
index c1fb53f794..3b10a37fc7 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -548,7 +548,6 @@ def prerelease(session: nox.sessions.Session, tests_path):
     # Ensure we catch breaking changes in the client libraries early.
session.install( "--upgrade", - "-e", "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery", ) already_installed.add("google-cloud-bigquery") From d0d9b84b101eb03c499d85e74dcfc900dedd4137 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 29 Nov 2023 09:15:41 -0800 Subject: [PATCH 23/26] fix: add df snapshots lookup for `read_gbq` (#229) --- bigframes/pandas/__init__.py | 6 ++ bigframes/session/__init__.py | 56 +++++++++++-------- bigframes/session/_io/bigquery.py | 5 -- tests/system/small/test_session.py | 18 ++++++ tests/unit/session/test_io_bigquery.py | 14 ----- .../bigframes_vendored/pandas/io/gbq.py | 3 + 6 files changed, 59 insertions(+), 43 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index d35f838366..0c2c1f87aa 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -486,6 +486,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( @@ -494,6 +495,7 @@ def read_gbq( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) @@ -516,6 +518,7 @@ def read_gbq_query( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -524,6 +527,7 @@ def read_gbq_query( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) @@ -536,6 +540,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -544,6 +549,7 @@ def read_gbq_table( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index d2f6137883..84a6eb5638 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -177,6 +177,7 @@ def __init__( # Now that we're starting the session, don't allow the options to be # changed. context._session_started = True + self._df_snapshot: Dict[bigquery.TableReference, datetime.datetime] = {} @property def bqclient(self): @@ -232,6 +233,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. 
@@ -242,6 +244,7 @@ def read_gbq( col_order=col_order, max_results=max_results, api_name="read_gbq", + use_cache=use_cache, ) else: # TODO(swast): Query the snapshot table but mark it as a @@ -253,6 +256,7 @@ def read_gbq( col_order=col_order, max_results=max_results, api_name="read_gbq", + use_cache=use_cache, ) def _query_to_destination( @@ -260,6 +264,7 @@ def _query_to_destination( query: str, index_cols: List[str], api_name: str, + use_cache: bool = True, ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: # If a dry_run indicates this is not a query type job, then don't # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. @@ -284,6 +289,7 @@ def _query_to_destination( job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name job_config.destination = temp_table + job_config.use_query_cache = use_cache try: # Write to temp table to workaround BigQuery 10 GB query results @@ -305,6 +311,7 @@ def read_gbq_query( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> dataframe.DataFrame: """Turn a SQL query into a DataFrame. @@ -362,6 +369,7 @@ def read_gbq_query( col_order=col_order, max_results=max_results, api_name="read_gbq_query", + use_cache=use_cache, ) def _read_gbq_query( @@ -372,6 +380,7 @@ def _read_gbq_query( col_order: Iterable[str] = (), max_results: Optional[int] = None, api_name: str = "read_gbq_query", + use_cache: bool = True, ) -> dataframe.DataFrame: if isinstance(index_col, str): index_cols = [index_col] @@ -379,7 +388,10 @@ def _read_gbq_query( index_cols = list(index_col) destination, query_job = self._query_to_destination( - query, index_cols, api_name=api_name + query, + index_cols, + api_name=api_name, + use_cache=use_cache, ) # If there was no destination table, that means the query must have @@ -403,6 +415,7 @@ def _read_gbq_query( index_col=index_cols, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) def read_gbq_table( @@ -412,6 +425,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. @@ -434,6 +448,7 @@ def read_gbq_table( col_order=col_order, max_results=max_results, api_name="read_gbq_table", + use_cache=use_cache, ) def _get_snapshot_sql_and_primary_key( @@ -441,6 +456,7 @@ def _get_snapshot_sql_and_primary_key( table_ref: bigquery.table.TableReference, *, api_name: str, + use_cache: bool = True, ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]: """Create a read-only Ibis table expression representing a table. @@ -448,19 +464,6 @@ def _get_snapshot_sql_and_primary_key( column(s), then return those too so that ordering generation can be avoided. """ - if table_ref.dataset_id.upper() == "_SESSION": - # _SESSION tables aren't supported by the tables.get REST API. - return ( - self.ibis_client.sql( - f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" - ), - None, - ) - table_expression = self.ibis_client.table( - table_ref.table_id, - database=f"{table_ref.project}.{table_ref.dataset_id}", - ) - # If there are primary keys defined, the query engine assumes these # columns are unique, even if the constraint is not enforced. We make # the same assumption and use these columns as the total ordering keys. 
@@ -481,14 +484,18 @@ def _get_snapshot_sql_and_primary_key( job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name - current_timestamp = list( - self.bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] + if use_cache and table_ref in self._df_snapshot.keys(): + snapshot_timestamp = self._df_snapshot[table_ref] + else: + snapshot_timestamp = list( + self.bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + self._df_snapshot[table_ref] = snapshot_timestamp table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + bigframes_io.create_snapshot_sql(table_ref, snapshot_timestamp) ) return table_expression, primary_keys @@ -500,12 +507,11 @@ def _read_gbq_table( col_order: Iterable[str] = (), max_results: Optional[int] = None, api_name: str, + use_cache: bool = True, ) -> dataframe.DataFrame: if max_results and max_results <= 0: raise ValueError("`max_results` should be a positive number.") - # TODO(swast): Can we re-use the temp table from other reads in the - # session, if the original table wasn't modified? table_ref = bigquery.table.TableReference.from_string( query, default_project=self.bqclient.project ) @@ -513,7 +519,9 @@ def _read_gbq_table( ( table_expression, total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key(table_ref, api_name=api_name) + ) = self._get_snapshot_sql_and_primary_key( + table_ref, api_name=api_name, use_cache=use_cache + ) for key in col_order: if key not in table_expression.columns: diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index dae73301e7..4770f12089 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -117,11 +117,6 @@ def create_snapshot_sql( table_ref: bigquery.TableReference, current_timestamp: datetime.datetime ) -> str: """Query a table via 'time travel' for consistent reads.""" - - # If we have a _SESSION table, assume that it's already a copy. Nothing to do here. - if table_ref.dataset_id.upper() == "_SESSION": - return f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" - # If we have an anonymous query results table, it can't be modified and # there isn't any BigQuery time travel. 
    if table_ref.dataset_id.startswith("_"):
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 7cd9f1dd59..26c5093b35 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -16,6 +16,7 @@
 import random
 import tempfile
 import textwrap
+import time
 import typing
 from typing import List
 
@@ -308,6 +309,23 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str):
     assert df["statement_type"][0] == "SCRIPT"
 
 
+def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):
+    df1 = session.read_gbq(penguins_table_id)
+    time.sleep(1)
+    df2 = session.read_gbq(penguins_table_id)
+    df1.columns = [
+        "species1",
+        "island1",
+        "culmen_length_mm1",
+        "culmen_depth_mm1",
+        "flipper_length_mm1",
+        "body_mass_g1",
+        "sex1",
+    ]
+    df3 = df1.join(df2)
+    assert df3 is not None
+
+
 def test_read_gbq_model(session, penguins_linear_model_name):
     model = session.read_gbq_model(penguins_linear_model_name)
     assert isinstance(model, bigframes.ml.linear_model.LinearRegression)
diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py
index c87835c412..3f3bfbe7d3 100644
--- a/tests/unit/session/test_io_bigquery.py
+++ b/tests/unit/session/test_io_bigquery.py
@@ -147,20 +147,6 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets():
     assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql
 
 
-def test_create_snapshot_sql_doesnt_timetravel_session_tables():
-    table_ref = bigquery.TableReference.from_string("my-test-project._session.abcdefg")
-
-    sql = bigframes.session._io.bigquery.create_snapshot_sql(
-        table_ref, datetime.datetime.now(datetime.timezone.utc)
-    )
-
-    # We aren't modifying _SESSION tables, so don't use time travel.
-    assert "SYSTEM_TIME" not in sql
-
-    # Don't need the project ID for _SESSION tables.
-    assert "my-test-project" not in sql
-
-
 def test_create_temp_table_default_expiration():
     """Make sure the created table has an expiration."""
     bqclient = mock.create_autospec(bigquery.Client)
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index 2161310b07..eabb48e600 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -16,6 +16,7 @@ def read_gbq(
     index_col: Iterable[str] | str = (),
     col_order: Iterable[str] = (),
     max_results: Optional[int] = None,
+    use_cache: bool = True,
 ):
     """Loads a DataFrame from BigQuery.
 
@@ -83,6 +84,8 @@
         max_results (Optional[int], default None):
             If set, limit the maximum number of rows to fetch from the
             query results.
+        use_cache (bool, default True):
+            Whether to reuse cached query results and table snapshots. Defaults to True.
 
     Returns:
         bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.
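
A usage sketch of the snapshot lookup introduced by this patch (illustrative,
not part of the change itself; the public penguins table stands in for any
readable table):

    import bigframes.pandas as bpd

    # Within one session, repeated reads of the same table reuse the snapshot
    # timestamp recorded in Session._df_snapshot, so both DataFrames observe
    # identical data and can be joined consistently.
    df1 = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    df2 = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")[["body_mass_g"]]
    df2.columns = ["body_mass_g_2"]
    joined = df1.join(df2)

    # Passing use_cache=False bypasses both the BigQuery query cache and the
    # cached snapshot timestamp, forcing a fresh read.
    fresh = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins", use_cache=False)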
From 0bfc4fb117686c734d4a2503d5a6de0e64e9f9b9 Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Wed, 29 Nov 2023 11:44:16 -0800
Subject: [PATCH 24/26] feat: add remote vertex model support (#237)

b/299356085
---
 bigframes/ml/__init__.py               |   1 +
 bigframes/ml/core.py                   |   8 ++
 bigframes/ml/remote.py                 | 157 +++++++++++++++++++++++++
 bigframes/ml/sql.py                    |  26 +++-
 docs/reference/bigframes.ml/index.rst  |   2 +
 docs/reference/bigframes.ml/remote.rst |   7 ++
 docs/templates/toc.yml                 |   6 +
 tests/system/small/ml/conftest.py      |  41 +++++++
 tests/system/small/ml/test_core.py     |  16 +++
 tests/system/small/ml/test_remote.py   |  33 ++++++
 tests/unit/ml/test_sql.py              |  26 ++++
 11 files changed, 319 insertions(+), 4 deletions(-)
 create mode 100644 bigframes/ml/remote.py
 create mode 100644 docs/reference/bigframes.ml/remote.rst
 create mode 100644 tests/system/small/ml/test_remote.py

diff --git a/bigframes/ml/__init__.py b/bigframes/ml/__init__.py
index 55c8709d8d..b2c62ff961 100644
--- a/bigframes/ml/__init__.py
+++ b/bigframes/ml/__init__.py
@@ -26,4 +26,5 @@
     "llm",
     "forecasting",
     "imported",
+    "remote",
 ]
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index d8135f7085..5aad77a394 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -294,6 +294,8 @@ def create_remote_model(
         self,
         session: bigframes.Session,
         connection_name: str,
+        input: Mapping[str, str] = {},
+        output: Mapping[str, str] = {},
         options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
     ) -> BqmlModel:
         """Create a session-temporary BQML remote model with the CREATE OR REPLACE MODEL statement
 
         Args:
             connection_name:
                 a BQ connection to talk with Vertex AI, of the format
                 <PROJECT_NUMBER>.<REGION>.<CONNECTION_NAME>.
                 https://cloud.google.com/bigquery/docs/create-cloud-resource-connection
+            input:
+                input schema for general remote models
+            output:
+                output schema for general remote models
             options:
                 a dict of options to configure the model. Generates a BQML OPTIONS clause
 
@@ -311,6 +317,8 @@
         sql = self._model_creation_sql_generator.create_remote_model(
             connection_name=connection_name,
             model_ref=model_ref,
+            input=input,
+            output=output,
             options=options,
         )
diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py
new file mode 100644
index 0000000000..d4c34bbd0d
--- /dev/null
+++ b/bigframes/ml/remote.py
@@ -0,0 +1,157 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BigFrames general remote models."""
+
+from __future__ import annotations
+
+from typing import Mapping, Optional, Union
+import warnings
+
+import bigframes
+from bigframes import clients
+from bigframes.core import log_adapter
+from bigframes.ml import base, core, globals, utils
+import bigframes.pandas as bpd
+
+_SUPPORTED_DTYPES = (
+    "bool",
+    "string",
+    "int64",
+    "float64",
+    "array<bool>",
+    "array<string>",
+    "array<int64>",
+    "array<float64>",
+)
+
+_REMOTE_MODEL_STATUS = "remote_model_status"
+
+
+@log_adapter.class_logger
+class VertexAIModel(base.BaseEstimator):
+    """Remote model from a Vertex AI HTTPS endpoint. Users must specify the endpoint, the input schema, and the output schema.
+    For how to deploy a model in Vertex AI, see https://cloud.google.com/bigquery/docs/bigquery-ml-remote-model-tutorial#Deploy-Model-on-Vertex-AI.
+
+    Args:
+        endpoint (str):
+            Vertex AI HTTPS endpoint.
+        input ({column_name: column_type}):
+            Input schema. Supported types are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
+        output ({column_name: column_type}):
+            Output label schema. Supports the same types as the input.
+        session (bigframes.Session or None):
+            BQ session to create the model. If None, use the global default session.
+        connection_name (str or None):
+            Connection to connect with the remote service. A str of the format <PROJECT_ID>.<LOCATION>.<CONNECTION_ID>.
+            If None, use the default connection in the session context. BigQuery DataFrames will try to create the connection and attach
+            permission if the connection isn't fully set up.
+    """
+
+    def __init__(
+        self,
+        endpoint: str,
+        input: Mapping[str, str],
+        output: Mapping[str, str],
+        session: Optional[bigframes.Session] = None,
+        connection_name: Optional[str] = None,
+    ):
+        self.endpoint = endpoint
+        self.input = input
+        self.output = output
+        self.session = session or bpd.get_global_session()
+
+        self._bq_connection_manager = clients.BqConnectionManager(
+            self.session.bqconnectionclient, self.session.resourcemanagerclient
+        )
+        connection_name = connection_name or self.session._bq_connection
+        self.connection_name = self._bq_connection_manager.resolve_full_connection_name(
+            connection_name,
+            default_project=self.session._project,
+            default_location=self.session._location,
+        )
+
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._bqml_model: core.BqmlModel = self._create_bqml_model()
+
+    def _create_bqml_model(self):
+        # Parse and create connection if needed.
+        if not self.connection_name:
+            raise ValueError(
+                "Must provide connection_name, either in constructor or through session options."
+            )
+        connection_name_parts = self.connection_name.split(".")
+        if len(connection_name_parts) != 3:
+            raise ValueError(
+                f"connection_name must be of the format <PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
+            )
+        self._bq_connection_manager.create_bq_connection(
+            project_id=connection_name_parts[0],
+            location=connection_name_parts[1],
+            connection_id=connection_name_parts[2],
+            iam_role="aiplatform.user",
+        )
+
+        options = {
+            "endpoint": self.endpoint,
+        }
+
+        def standardize_type(v: str):
+            v = v.lower()
+            v = v.replace("boolean", "bool")
+
+            if v not in _SUPPORTED_DTYPES:
+                raise ValueError(
+                    f"Data type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}."
+ ) + + return v + + self.input = {k: standardize_type(v) for k, v in self.input.items()} + self.output = {k: standardize_type(v) for k, v in self.output.items()} + + return self._bqml_model_factory.create_remote_model( + session=self.session, + connection_name=self.connection_name, + input=self.input, + output=self.output, + options=options, + ) + + def predict( + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: + """Predict the result from the input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, which needs to comply with the input parameter of the model. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.predict(X) + + # unlike LLM models, the general remote model status is null for successful runs. + if (df[_REMOTE_MODEL_STATUS].notna()).any(): + warnings.warn( + f"Some predictions failed. Check column {_REMOTE_MODEL_STATUS} for detailed status. You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index ab051231fb..1c88eda4ab 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -57,6 +57,12 @@ def build_expressions(self, *expr_sqls: str) -> str: indent_str = " " return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) + def build_schema(self, **kwargs: str) -> str: + """Encode a dict of values into a formatted schema type items for SQL""" + indent_str = " " + param_strs = [f"{k} {v}" for k, v in kwargs.items()] + return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + def options(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode the OPTIONS clause for BQML""" return f"OPTIONS({self.build_parameters(**kwargs)})" @@ -65,6 +71,14 @@ def struct_options(self, **kwargs: Union[int, float]) -> str: """Encode a BQ STRUCT as options.""" return f"STRUCT({self.build_structs(**kwargs)})" + def input(self, **kwargs: str) -> str: + """Encode a BQML INPUT clause.""" + return f"INPUT({self.build_schema(**kwargs)})" + + def output(self, **kwargs: str) -> str: + """Encode a BQML OUTPUT clause.""" + return f"OUTPUT({self.build_schema(**kwargs)})" + # Connection def connection(self, conn_name: str) -> str: """Encode the REMOTE WITH CONNECTION clause for BQML. 
conn_name is of the format <PROJECT_ID>.<LOCATION>.<CONNECTION_ID>."""
@@ -154,15 +168,19 @@ def create_remote_model(
         self,
         connection_name: str,
         model_ref: google.cloud.bigquery.ModelReference,
+        input: Mapping[str, str] = {},
+        output: Mapping[str, str] = {},
         options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
     ) -> str:
         """Encode the CREATE OR REPLACE MODEL statement for BQML remote model."""
-        options_sql = self.options(**options)
-
         parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"]
+        if input:
+            parts.append(self.input(**input))
+        if output:
+            parts.append(self.output(**output))
         parts.append(self.connection(connection_name))
-        if options_sql:
-            parts.append(options_sql)
+        if options:
+            parts.append(self.options(**options))
         return "\n".join(parts)
 
     def create_imported_model(
diff --git a/docs/reference/bigframes.ml/index.rst b/docs/reference/bigframes.ml/index.rst
index f3cbe1174a..1975d62e6d 100644
--- a/docs/reference/bigframes.ml/index.rst
+++ b/docs/reference/bigframes.ml/index.rst
@@ -30,3 +30,5 @@ API Reference
 
     pipeline
     preprocessing
+
+    remote
diff --git a/docs/reference/bigframes.ml/remote.rst b/docs/reference/bigframes.ml/remote.rst
new file mode 100644
index 0000000000..7827acfe92
--- /dev/null
+++ b/docs/reference/bigframes.ml/remote.rst
@@ -0,0 +1,7 @@
+bigframes.ml.remote
+===================
+
+.. automodule:: bigframes.ml.remote
+    :members:
+    :inherited-members:
+    :undoc-members:
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index 9879721d28..58ac1c0efe 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -108,6 +108,12 @@
       - name: PaLM2TextEmbeddingGenerator
         uid: bigframes.ml.llm.PaLM2TextEmbeddingGenerator
       name: llm
+    - items:
+      - name: Overview
+        uid: bigframes.ml.remote
+      - name: VertexAIModel
+        uid: bigframes.ml.remote.VertexAIModel
+      name: remote
     - items:
       - name: metrics
         uid: bigframes.ml.metrics
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index c11445b79a..c4a1272e44 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -29,6 +29,7 @@
     imported,
     linear_model,
     llm,
+    remote,
 )
 
 
@@ -247,6 +248,46 @@ def palm2_embedding_generator_multilingual_model(
     )
 
 
+@pytest.fixture(scope="session")
+def linear_remote_model_params() -> dict:
+    # Pre-deployed endpoint of linear reg model in Vertex.
+    # bigframes-test-linreg2 -> bigframes-test-linreg-endpoint2
+    return {
+        "input": {"culmen_length_mm": "float64"},
+        "output": {"predicted_body_mass_g": "array<float64>"},
+        "endpoint": "https://us-central1-aiplatform.googleapis.com/v1/projects/1084210331973/locations/us-central1/endpoints/3193318217619603456",
+    }
+
+
+@pytest.fixture(scope="session")
+def bqml_linear_remote_model(
+    session, bq_connection, linear_remote_model_params
+) -> core.BqmlModel:
+    options = {
+        "endpoint": linear_remote_model_params["endpoint"],
+    }
+    return globals.bqml_model_factory().create_remote_model(
+        session=session,
+        input=linear_remote_model_params["input"],
+        output=linear_remote_model_params["output"],
+        connection_name=bq_connection,
+        options=options,
+    )
+
+
+@pytest.fixture(scope="session")
+def linear_remote_vertex_model(
+    session, bq_connection, linear_remote_model_params
+) -> remote.VertexAIModel:
+    return remote.VertexAIModel(
+        endpoint=linear_remote_model_params["endpoint"],
+        input=linear_remote_model_params["input"],
+        output=linear_remote_model_params["output"],
+        session=session,
+        connection_name=bq_connection,
+    )
+
+
 @pytest.fixture(scope="session")
 def time_series_bqml_arima_plus_model(
     session, time_series_arima_plus_model_name
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index be34a4871c..22cbbb1932 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -289,6 +289,22 @@ def test_model_predict_with_unnamed_index(
     )
 
 
+def test_remote_model_predict(
+    bqml_linear_remote_model: core.BqmlModel, new_penguins_df
+):
+    predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas()
+    expected = pd.DataFrame(
+        {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]},
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        predictions[["predicted_body_mass_g"]].sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
+
+
 @pytest.mark.flaky(retries=2, delay=120)
 def test_model_generate_text(
     bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df
diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py
new file mode 100644
index 0000000000..e8eb1c85e8
--- /dev/null
+++ b/tests/system/small/ml/test_remote.py
@@ -0,0 +1,33 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+
+from bigframes.ml import remote
+
+
+def test_remote_linear_vertex_model_predict(
+    linear_remote_vertex_model: remote.VertexAIModel, new_penguins_df
+):
+    predictions = linear_remote_vertex_model.predict(new_penguins_df).to_pandas()
+    expected = pd.DataFrame(
+        {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]},
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        predictions[["predicted_body_mass_g"]].sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index ea16722393..9223058540 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -190,6 +190,32 @@ def test_create_remote_model_produces_correct_sql(
     )
 
 
+def test_create_remote_model_with_params_produces_correct_sql(
+    model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
+):
+    sql = model_creation_sql_generator.create_remote_model(
+        connection_name="my_project.us.my_connection",
+        model_ref=bigquery.ModelReference.from_string(
+            "test-proj._anonXYZ.create_remote_model"
+        ),
+        input={"column1": "int64"},
+        output={"result": "array<int64>"},
+        options={"option_key1": "option_value1", "option_key2": 2},
+    )
+    assert (
+        sql
+        == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model`
+INPUT(
+  column1 int64)
+OUTPUT(
+  result array<int64>)
+REMOTE WITH CONNECTION `my_project.us.my_connection`
+OPTIONS(
+  option_key1="option_value1",
+  option_key2=2)"""
+    )
+
+
 def test_create_imported_model_produces_correct_sql(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
 ):

From 6c899be2989e24f697d72fe1bb92ebbf7dec84cb Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Wed, 29 Nov 2023 13:12:15 -0800
Subject: [PATCH 25/26] chore: release 0.15.0 (#241)

Release-As: 0.15.0

From 8089b15feddaeb9c56a8f976b439315fcfed0301 Mon Sep 17 00:00:00 2001
From: "release-please[bot]"
 <55107282+release-please[bot]@users.noreply.github.com>
Date: Wed, 29 Nov 2023 14:46:09 -0800
Subject: [PATCH 26/26] chore(main): release 0.15.0 (#214)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md         | 45 ++++++++++++++++++++++++++++++++++++++++++++
 bigframes/version.py |  2 +-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 091967513a..ef75a017e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,51 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [0.15.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.14.1...v0.15.0) (2023-11-29)
+
+
+### ⚠ BREAKING CHANGES
+
+* model.predict returns all the columns ([#204](https://github.com/googleapis/python-bigquery-dataframes/issues/204))
+
+### Features
+
+* Add info and memory_usage methods to dataframe ([#219](https://github.com/googleapis/python-bigquery-dataframes/issues/219)) ([9d6613d](https://github.com/googleapis/python-bigquery-dataframes/commit/9d6613d318b558722b7bab12773efdea4bbe9931))
+* Add remote vertex model support ([#237](https://github.com/googleapis/python-bigquery-dataframes/issues/237)) ([0bfc4fb](https://github.com/googleapis/python-bigquery-dataframes/commit/0bfc4fb117686c734d4a2503d5a6de0e64e9f9b9))
+* Add the recent api method for ML component ([#225](https://github.com/googleapis/python-bigquery-dataframes/issues/225))
([ed8876d](https://github.com/googleapis/python-bigquery-dataframes/commit/ed8876d3439a3b45b65e8789737c3c2e3a7f1adb)) +* Model.predict returns all the columns ([#204](https://github.com/googleapis/python-bigquery-dataframes/issues/204)) ([416171a](https://github.com/googleapis/python-bigquery-dataframes/commit/416171a70d91d4a6b71622ba72685147ab7d6186)) +* Send warnings on LLM prediction partial failures ([#216](https://github.com/googleapis/python-bigquery-dataframes/issues/216)) ([81125f9](https://github.com/googleapis/python-bigquery-dataframes/commit/81125f9505ad98e89939769a8e1fcf30518705f0)) + + +### Bug Fixes + +* Add df snapshots lookup for `read_gbq` ([#229](https://github.com/googleapis/python-bigquery-dataframes/issues/229)) ([d0d9b84](https://github.com/googleapis/python-bigquery-dataframes/commit/d0d9b84b101eb03c499d85e74dcfc900dedd4137)) +* Avoid unnecessary row_number() on sort key for io ([#211](https://github.com/googleapis/python-bigquery-dataframes/issues/211)) ([a18d40e](https://github.com/googleapis/python-bigquery-dataframes/commit/a18d40e808ee0822d21715cc3e8f794c418aeebc)) +* Dedup special character ([#209](https://github.com/googleapis/python-bigquery-dataframes/issues/209)) ([dd78acb](https://github.com/googleapis/python-bigquery-dataframes/commit/dd78acb174545ba292776a642afcec46f8ee4a2a)) +* Invalid JSON type of the notebook ([#215](https://github.com/googleapis/python-bigquery-dataframes/issues/215)) ([a729831](https://github.com/googleapis/python-bigquery-dataframes/commit/a7298317ea2604faa6ae31817f1f729d7e0b9818)) +* Make to_pandas override enable_downsampling when sampling_method is manually set. ([#200](https://github.com/googleapis/python-bigquery-dataframes/issues/200)) ([ae03756](https://github.com/googleapis/python-bigquery-dataframes/commit/ae03756f5ee45e0e74e0c0bdd4777e018eba2273)) +* Polish the llm+kmeans notebook ([#208](https://github.com/googleapis/python-bigquery-dataframes/issues/208)) ([e8532b1](https://github.com/googleapis/python-bigquery-dataframes/commit/e8532b1d999d26ea1ebdd30efb8f2c0a93a6a28d)) +* Update the llm+kmeans notebook with recent change ([#236](https://github.com/googleapis/python-bigquery-dataframes/issues/236)) ([f8917ab](https://github.com/googleapis/python-bigquery-dataframes/commit/f8917abc094e222e0435891d4d184b77bfe67722)) +* Use anonymous dataset to create `remote_function` ([#205](https://github.com/googleapis/python-bigquery-dataframes/issues/205)) ([69b016e](https://github.com/googleapis/python-bigquery-dataframes/commit/69b016eae7ea97d84ceeb22ba09f5472841db072)) + + +### Documentation + +* Add code samples for `index` and `column` properties ([#212](https://github.com/googleapis/python-bigquery-dataframes/issues/212)) ([c88d38e](https://github.com/googleapis/python-bigquery-dataframes/commit/c88d38e69682f4c620174086b8f16f4780c04811)) +* Add code samples for df reshaping, function, merge, and join methods ([#203](https://github.com/googleapis/python-bigquery-dataframes/issues/203)) ([010486c](https://github.com/googleapis/python-bigquery-dataframes/commit/010486c3494e05d714da6cc7d51514518d9ae1ea)) +* Add examples for dataframe.kurt, dataframe.std, dataframe.count ([#232](https://github.com/googleapis/python-bigquery-dataframes/issues/232)) ([f9c6e72](https://github.com/googleapis/python-bigquery-dataframes/commit/f9c6e727e2b901310bb5301da449d616ea85e135)) +* Add examples for dataframe.mean, dataframe.median, dataframe.va… ([#228](https://github.com/googleapis/python-bigquery-dataframes/issues/228)) 
([edd0522](https://github.com/googleapis/python-bigquery-dataframes/commit/edd0522747eadb74780124fb18ed7face251441d)) +* Add examples for dataframe.min, dataframe.max and dataframe.sum ([#227](https://github.com/googleapis/python-bigquery-dataframes/issues/227)) ([3a375e8](https://github.com/googleapis/python-bigquery-dataframes/commit/3a375e87b64b8fb51370bfec8f2cfdbcd8fe960a)) +* Code samples for `Series.dot` and `DataFrame.dot` ([#226](https://github.com/googleapis/python-bigquery-dataframes/issues/226)) ([b62a07a](https://github.com/googleapis/python-bigquery-dataframes/commit/b62a07a95cd60f995a48825c9874822d0eb02483)) +* Code samples for `Series.where` and `Series.mask` ([#217](https://github.com/googleapis/python-bigquery-dataframes/issues/217)) ([52dfad2](https://github.com/googleapis/python-bigquery-dataframes/commit/52dfad281def82548751a276ce42b087dbb09f9a)) +* Code samples for dataframe.any, dataframe.all and dataframe.prod ([#223](https://github.com/googleapis/python-bigquery-dataframes/issues/223)) ([d7957fa](https://github.com/googleapis/python-bigquery-dataframes/commit/d7957fad071d223ef8f6fb8f3de395c865ff60aa)) +* Make the code samples reflect default bq connection usage ([#206](https://github.com/googleapis/python-bigquery-dataframes/issues/206)) ([71844b0](https://github.com/googleapis/python-bigquery-dataframes/commit/71844b03cdbfe684320c186a0488c8c7fb4fcd6e)) + + +### Miscellaneous Chores + +* Release 0.15.0 ([#241](https://github.com/googleapis/python-bigquery-dataframes/issues/241)) ([6c899be](https://github.com/googleapis/python-bigquery-dataframes/commit/6c899be2989e24f697d72fe1bb92ebbf7dec84cb)) + ## [0.14.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.14.0...v0.14.1) (2023-11-16) diff --git a/bigframes/version.py b/bigframes/version.py index 46e57e5b88..920cb95c3d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.14.1" +__version__ = "0.15.0"
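
For reference, a minimal usage sketch of the bigframes.ml.remote.VertexAIModel
API released in 0.15.0 above (the endpoint URL, connection name, and schemas
below are illustrative placeholders, not real resources):

    import bigframes.pandas as bpd
    from bigframes.ml import remote

    # Input/output schemas use the supported types: "bool", "string", "int64",
    # "float64", or their "array<...>" forms.
    model = remote.VertexAIModel(
        endpoint=(
            "https://us-central1-aiplatform.googleapis.com/v1/projects/"
            "123456789012/locations/us-central1/endpoints/1234567890123456789"
        ),
        input={"culmen_length_mm": "float64"},
        output={"predicted_body_mass_g": "array<float64>"},
        connection_name="my-project.us.bigframes-default-connection",
    )

    # predict() returns the input columns plus the prediction columns; a
    # non-null remote_model_status marks rows whose remote call failed.
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    predictions = model.predict(df[["culmen_length_mm"]].dropna())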