From e8532b1d999d26ea1ebdd30efb8f2c0a93a6a28d Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 16 Nov 2023 10:08:27 -0800 Subject: [PATCH 01/26] fix: polish the llm+kmeans notebook (#208) --- .../bq_dataframes_llm_code_generation.ipynb | 2 +- .../bq_dataframes_llm_kmeans.ipynb | 1181 +++++++++++++++-- 2 files changed, 1057 insertions(+), 126 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 0f113b84c6..0a41447a53 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -34,7 +34,7 @@ "
\n",
- " \n",
+ " \n",
" ![]() | \n",
diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
index 46c4955288..ae03813639 100644
--- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
+++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
@@ -31,7 +31,7 @@
"
\n",
- " \n",
+ " \n",
" ![]() | \n",
@@ -118,14 +118,10 @@
"\n",
"2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
"\n",
- "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n",
+ "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,aiplatform.googleapis.com) to enable the following APIs:\n",
"\n",
" * BigQuery API\n",
" * BigQuery Connection API\n",
- " * Cloud Run API\n",
- " * Artifact Registry API\n",
- " * Cloud Build API\n",
- " * Cloud Resource Manager API\n",
" * Vertex AI API\n",
"\n",
"4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)."
@@ -143,9 +139,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Updated property [core/project].\n"
+ ]
+ }
+ ],
"source": [
"# set your project ID below\n",
"PROJECT_ID = \"\" # @param {type:\"string\"}\n",
@@ -166,7 +170,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -232,87 +236,6 @@
"# auth.authenticate_user()"
]
},
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Connect to Vertex AI\n",
- "\n",
- "In order to use PaLM2TextGenerator, we will need to set up a [cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from google.cloud import bigquery_connection_v1 as bq_connection\n",
- "\n",
- "CONN_NAME = \"bqdf-llm\"\n",
- "\n",
- "client = bq_connection.ConnectionServiceClient()\n",
- "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n",
- "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n",
- "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n",
- "\n",
- "try:\n",
- " request = client.get_connection(\n",
- " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n",
- " )\n",
- " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n",
- "except Exception:\n",
- " connection = bq_connection.types.Connection(\n",
- " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n",
- " )\n",
- " request = bq_connection.CreateConnectionRequest(\n",
- " {\n",
- " \"parent\": new_conn_parent,\n",
- " \"connection_id\": CONN_NAME,\n",
- " \"connection\": connection,\n",
- " }\n",
- " )\n",
- " response = client.create_connection(request)\n",
- " CONN_SERVICE_ACCOUNT = (\n",
- " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n",
- " )\n",
- "print(CONN_SERVICE_ACCOUNT)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Set permissions for the service account\n",
- "\n",
- "The resource connection service account requires certain project-level permissions:\n",
- " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n",
- " - `roles/run.invoker`: This role is required for the connection to have read-only access to Cloud Run services that back custom/remote functions ([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n",
- "\n",
- "Set these permissions by running the following `gcloud` commands:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n",
- "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n",
- "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'"
- ]
- },
{
"attachments": {},
"cell_type": "markdown",
@@ -336,12 +259,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Project Setup"
+ "BigQuery DataFrames setup"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"id": "R7STCS8xB5d2"
},
@@ -353,6 +276,14 @@
"bf.options.bigquery.location = REGION"
]
},
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location."
+ ]
+ },
{
"attachments": {},
"cell_type": "markdown",
@@ -365,7 +296,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"id": "zDSwoBo1CU3G"
},
@@ -376,11 +307,101 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"id": "tYDoaKgJChiq"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 9f096761-e3b5-4d58-a9f7-485ced67afca is DONE. 2.3 GB processed. Open Job"
+ ],
+ "text/plain": [
+ "
\n", + " | consumer_complaint_narrative | \n", + "
---|---|
0 | \n", + "I signed a contract as a condition of employme... | \n", + "
1 | \n", + "First, I want to disclose that XXXX and XXXX b... | \n", + "
2 | \n", + "Frequent calls from Focused Receivables Manage... | \n", + "
3 | \n", + "I recently contacted Enhanced Recovery Company... | \n", + "
4 | \n", + "This began when I subscribed to XXXX XXXX inte... | \n", + "
5 rows × 1 columns
\n", + "\n", + " | text_embedding | \n", + "
---|---|
422 | \n", + "[-0.012013785541057587, 0.003669967409223318, ... | \n", + "
616 | \n", + "[-0.014948881231248379, -0.04672442376613617, ... | \n", + "
833 | \n", + "[-0.01951478235423565, -0.027120858430862427, ... | \n", + "
1370 | \n", + "[-0.03140445053577423, -0.048797041177749634, ... | \n", + "
1430 | \n", + "[-0.02244548313319683, -0.03336532413959503, 0... | \n", + "
5 rows × 1 columns
\n", + "\n", + " | consumer_complaint_narrative | \n", + "text_embedding | \n", + "
---|---|---|
2580664 | \n", + "Hello, my name is XXXX XXXX, and I am writing ... | \n", + "[0.0003211698785889894, -0.01816680282354355, ... | \n", + "
1806973 | \n", + "This is XXXX XXXX and I am submitting this com... | \n", + "[-0.009485247544944286, -0.025846892967820168,... | \n", + "
2055053 | \n", + "XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... | \n", + "[-0.010950954630970955, -0.0249345600605011, 0... | \n", + "
2515231 | \n", + "When I reinvestigated my credit report, I real... | \n", + "[-0.009660656563937664, -0.05793113633990288, ... | \n", + "
2633049 | \n", + "Checking my credit report XX/XX/2018 with all ... | \n", + "[-0.0022159104701131582, -0.03330004960298538,... | \n", + "
3117273 | \n", + "I contacted TransUnion and spoke a credit rep ... | \n", + "[-0.015955328941345215, -0.006488671060651541,... | \n", + "
698814 | \n", + "XXXX XXXX XXXX. makes daily calls to me cell c... | \n", + "[0.005397460889071226, -0.01276913657784462, 0... | \n", + "
267826 | \n", + "Can we please reopen Case : XXXX? \n", + "\n", + "Wells Farg... | \n", + "[0.004065403249114752, -0.0005381882656365633,... | \n", + "
54019 | \n", + "My rights under 15 USC 1681 have been violated... | \n", + "[0.013823015615344048, -0.02010691538453102, 0... | \n", + "
141050 | \n", + "To whom it may concern : My personal informati... | \n", + "[0.008104532025754452, -0.01856449618935585, 0... | \n", + "
2962076 | \n", + "I have had a CashApp account since last year, ... | \n", + "[-0.0003019514260813594, -0.03750108182430267,... | \n", + "
2481105 | \n", + "that some of the information was erroneous. Th... | \n", + "[-0.014868081547319889, -0.0443895161151886, -... | \n", + "
431562 | \n", + "I have disputed the referenced accounts to the... | \n", + "[-0.0020524838473647833, -0.04830990731716156,... | \n", + "
1953029 | \n", + "On, XX/XX/22, I attempted to complete a transa... | \n", + "[-0.01599179394543171, -0.0074900356121361256,... | \n", + "
2395979 | \n", + "Subject : XXXX XXXX XXXX compensation, refund,... | \n", + "[-0.0035950862802565098, -0.014652969315648079... | \n", + "
455524 | \n", + "I paid off my mortgage on XX/XX/2019. The comp... | \n", + "[-0.01100730150938034, -0.03495829552412033, 0... | \n", + "
2155924 | \n", + "This kind of account is placed as a charged of... | \n", + "[-0.028635455295443535, -0.028604287654161453,... | \n", + "
1069497 | \n", + "This is one of many issues I have had with Wel... | \n", + "[0.008871790021657944, -0.028502725064754486, ... | \n", + "
3181689 | \n", + "I have disputed this account with MONTEREY FIN... | \n", + "[-0.004721717908978462, -0.03673810139298439, ... | \n", + "
274268 | \n", + "Lender is not updating my loan status in the V... | \n", + "[-0.009221495129168034, -0.0289347805082798, 0... | \n", + "
1671305 | \n", + "XXXX is a peer to peer lending conmpany that u... | \n", + "[-0.02911308966577053, -0.01850792020559311, -... | \n", + "
886026 | \n", + "( DISPUTE CODE - XXXX ) My personal informatio... | \n", + "[-0.007220877334475517, -0.016615957021713257,... | \n", + "
1044431 | \n", + "I filed a complaint against PNC this year and ... | \n", + "[0.002848619595170021, -0.035117778927087784, ... | \n", + "
1938481 | \n", + "I applied for a modification and was approved.... | \n", + "[-0.03114932030439377, -0.0421406552195549, 0.... | \n", + "
1987834 | \n", + "Ive been Disputting my XXXX XXXX I opened this... | \n", + "[-0.009406660683453083, -0.020967338234186172,... | \n", + "
25 rows × 2 columns
\n", + "\n", + " | CENTROID_ID | \n", + "
---|---|
422 | \n", + "2 | \n", + "
616 | \n", + "3 | \n", + "
833 | \n", + "5 | \n", + "
1370 | \n", + "7 | \n", + "
1430 | \n", + "3 | \n", + "
5 rows × 1 columns
\n", + "\n", + " | consumer_complaint_narrative | \n", + "text_embedding | \n", + "CENTROID_ID | \n", + "
---|---|---|---|
2580664 | \n", + "Hello, my name is XXXX XXXX, and I am writing ... | \n", + "[0.0003211698785889894, -0.01816680282354355, ... | \n", + "2 | \n", + "
1806973 | \n", + "This is XXXX XXXX and I am submitting this com... | \n", + "[-0.009485247544944286, -0.025846892967820168,... | \n", + "5 | \n", + "
2055053 | \n", + "XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... | \n", + "[-0.010950954630970955, -0.0249345600605011, 0... | \n", + "3 | \n", + "
2515231 | \n", + "When I reinvestigated my credit report, I real... | \n", + "[-0.009660656563937664, -0.05793113633990288, ... | \n", + "5 | \n", + "
2633049 | \n", + "Checking my credit report XX/XX/2018 with all ... | \n", + "[-0.0022159104701131582, -0.03330004960298538,... | \n", + "3 | \n", + "
5 rows × 3 columns
\n", + "25 rows × 1 columns
\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.118128\n", - "8 4048.685642\n", - "17 3976.454093\n", - "23 3541.582194\n", - "25 4032.844186\n", - "27 4118.351772\n", - "29 4087.767826\n", - "34 3183.755249\n", - "35 3418.802274\n", - "39 3519.186468\n", - "51 3398.135365\n", - "52 3223.615957\n", - "60 3445.014718\n", - "61 3505.638864\n", - "64 3515.905786\n", - "65 4028.363185\n", - "67 4159.993943\n", - "83 3348.16883\n", - "85 3485.050273\n", - "93 4172.874548\n", - "104 3299.302424\n", - "105 3515.687917\n", - "108 3405.224618\n", - "113 4209.140425\n", - "130 4197.905737\n", - "...\n", + "25 rows × 7 columns
\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g onehotencoded_island \\\n", + "penguin_id \n", + "1 3781.402407 [{'index': 3, 'value': 1.0}] \n", + "4 4124.107944 [{'index': 1, 'value': 1.0}] \n", + "8 4670.344196 [{'index': 1, 'value': 1.0}] \n", + "11 3529.417214 [{'index': 2, 'value': 1.0}] \n", + "13 4014.101714 [{'index': 1, 'value': 1.0}] \n", + "15 5212.41288 [{'index': 1, 'value': 1.0}] \n", + "16 4163.595615 [{'index': 3, 'value': 1.0}] \n", + "23 3392.453069 [{'index': 2, 'value': 1.0}] \n", + "34 4698.305397 [{'index': 1, 'value': 1.0}] \n", + "36 4828.226949 [{'index': 1, 'value': 1.0}] \n", + "42 3430.58866 [{'index': 1, 'value': 1.0}] \n", + "48 5314.260221 [{'index': 1, 'value': 1.0}] \n", + "61 5363.205372 [{'index': 1, 'value': 1.0}] \n", + "64 4855.908314 [{'index': 1, 'value': 1.0}] \n", + "65 3413.100524 [{'index': 2, 'value': 1.0}] \n", + "68 3340.219002 [{'index': 3, 'value': 1.0}] \n", + "70 4228.73157 [{'index': 2, 'value': 1.0}] \n", + "72 3811.538478 [{'index': 2, 'value': 1.0}] \n", + "74 4659.770763 [{'index': 1, 'value': 1.0}] \n", + "77 3453.388804 [{'index': 2, 'value': 1.0}] \n", + "81 4766.245033 [{'index': 1, 'value': 1.0}] \n", + "91 4057.807281 [{'index': 2, 'value': 1.0}] \n", + "96 4739.827445 [{'index': 1, 'value': 1.0}] \n", + "105 3394.891976 [{'index': 1, 'value': 1.0}] \n", + "111 3201.493683 [{'index': 1, 'value': 1.0}] \n", "\n", - "[67 rows x 1 columns]" + " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 -0.938587 0.748033 \n", + "4 -0.16745 0.899528 \n", + "8 0.453222 -1.877885 \n", + "11 -1.12667 0.697535 \n", + "13 -1.183094 1.404513 \n", + "15 0.867003 -0.766919 \n", + "16 -1.784958 1.959995 \n", + "23 -0.355532 0.647036 \n", + "34 -0.600039 -1.776888 \n", + "36 -0.129833 -1.423399 \n", + "42 -1.615684 -0.514427 \n", + "48 0.415606 -0.716421 \n", + "61 0.396797 -1.170907 \n", + "64 0.434414 -1.120408 \n", + "65 -1.220711 1.051024 \n", + "68 -1.484026 -0.009443 \n", + "70 1.638141 1.404513 \n", + "72 0.829387 0.142052 \n", + "74 -0.242683 -1.524396 \n", + "77 -1.277136 -0.211437 \n", + "81 0.208715 -1.221405 \n", + "91 1.261976 0.647036 \n", + "96 0.246331 -1.322402 \n", + "105 -1.803766 0.445043 \n", + "111 -1.164286 0.697535 \n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] \n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 25, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -2423,18 +2554,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a16e04253a42b7a5ce247d8f63b656", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 027042f1-9a18-43d8-a378-ab9410e395b1 is DONE. 23.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f19614c-82c0-4f8b-b74b-9d91a894efdd is RUNNING. " ] }, "metadata": {}, @@ -2442,13 +2571,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a99ac15431e433595de1040872a4558", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 6c8484a0-a504-4e50-93d6-3d247c9ff558 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 51899e2d-f6ef-4e62-98b6-c11550f74f4b is RUNNING. " ] }, "metadata": {}, @@ -2456,13 +2583,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "90909b620e084f59b0f9da266257593f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e81ca2de-df2e-41ec-af86-14f8dcec1b44 is DONE. 6.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 44d3fddc-74bc-4de0-a458-2c73b38f74fb is RUNNING. " ] }, "metadata": {}, @@ -2470,13 +2595,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2a9c2c05041a4fb691809bab5310bb05", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 3e6d413c-f8c4-4390-95eb-3a1f5bc59aed is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 33584475-f02b-4c98-9a51-e29996f4f950 is RUNNING. " ] }, "metadata": {}, @@ -2484,13 +2607,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6b0677c228d54b409c66e5dfa98d7e00", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e448220d-0c50-45b7-bcbe-d1159b3d18ce is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job df25ba49-280e-424d-a357-dde71a9b35dd is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2498,13 +2619,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "379ae6497fb34f969d21b2cd664e8bfa", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e167a234-828d-4f05-8654-63cf97e50ba3 is DONE. 10.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f92a04e-af7e-41d6-9303-6366c1751294 is RUNNING. " ] }, "metadata": {}, @@ -2532,152 +2651,452 @@ "25 rows × 1 columns
\n", - "[67 rows x 1 columns in total]" + "25 rows × 8 columns
\n", + "[67 rows x 8 columns in total]" ], "text/plain": [ - " CENTROID_ID\n", - "penguin_id \n", - "3 3\n", - "8 3\n", - "17 3\n", - "23 1\n", - "25 3\n", - "27 3\n", - "29 3\n", - "34 3\n", - "35 1\n", - "39 3\n", - "51 1\n", - "52 3\n", - "60 3\n", - "61 3\n", - "64 1\n", - "65 1\n", - "67 3\n", - "83 3\n", - "85 1\n", - "93 1\n", - "104 3\n", - "105 1\n", - "108 3\n", - "113 3\n", - "130 1\n", - "...\n", + " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", + "penguin_id \n", + "1 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035... \n", + "4 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856... \n", + "8 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975... \n", + "11 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012... \n", + "13 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949... \n", + "15 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772... \n", + "16 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720... \n", + "23 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483... \n", + "34 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302... \n", + "36 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190... \n", + "42 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930... \n", + "48 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196... \n", + "61 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282... \n", + "64 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625... \n", + "65 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148... \n", + "68 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449... \n", + "70 4 [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955... \n", + "72 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009... \n", + "74 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546... \n", + "77 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663... \n", + "81 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073... \n", + "91 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086... \n", + "96 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227... \n", + "105 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029... \n", + "111 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385... \n", "\n", - "[67 rows x 1 columns]" + " onehotencoded_island standard_scaled_culmen_length_mm \\\n", + "penguin_id \n", + "1 [{'index': 3, 'value': 1.0}] -0.938587 \n", + "4 [{'index': 1, 'value': 1.0}] -0.16745 \n", + "8 [{'index': 1, 'value': 1.0}] 0.453222 \n", + "11 [{'index': 2, 'value': 1.0}] -1.12667 \n", + "13 [{'index': 1, 'value': 1.0}] -1.183094 \n", + "15 [{'index': 1, 'value': 1.0}] 0.867003 \n", + "16 [{'index': 3, 'value': 1.0}] -1.784958 \n", + "23 [{'index': 2, 'value': 1.0}] -0.355532 \n", + "34 [{'index': 1, 'value': 1.0}] -0.600039 \n", + "36 [{'index': 1, 'value': 1.0}] -0.129833 \n", + "42 [{'index': 1, 'value': 1.0}] -1.615684 \n", + "48 [{'index': 1, 'value': 1.0}] 0.415606 \n", + "61 [{'index': 1, 'value': 1.0}] 0.396797 \n", + "64 [{'index': 1, 'value': 1.0}] 0.434414 \n", + "65 [{'index': 2, 'value': 1.0}] -1.220711 \n", + "68 [{'index': 3, 'value': 1.0}] -1.484026 \n", + "70 [{'index': 2, 'value': 1.0}] 1.638141 \n", + "72 [{'index': 2, 'value': 1.0}] 0.829387 \n", + "74 [{'index': 1, 'value': 1.0}] -0.242683 \n", + "77 [{'index': 2, 'value': 1.0}] -1.277136 \n", + "81 [{'index': 1, 'value': 1.0}] 0.208715 \n", + "91 [{'index': 2, 'value': 1.0}] 1.261976 \n", + "96 [{'index': 1, 'value': 1.0}] 0.246331 \n", + "105 [{'index': 1, 'value': 1.0}] -1.803766 \n", + "111 [{'index': 1, 'value': 1.0}] -1.164286 \n", + "\n", + " standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 0.748033 \n", + "4 0.899528 \n", + "8 -1.877885 \n", + "11 0.697535 \n", + "13 1.404513 \n", + "15 -0.766919 \n", + "16 1.959995 \n", + "23 0.647036 \n", + "34 -1.776888 \n", + "36 -1.423399 \n", + "42 -0.514427 \n", + "48 -0.716421 \n", + "61 -1.170907 \n", + "64 -1.120408 \n", + "65 1.051024 \n", + "68 -0.009443 \n", + "70 1.404513 \n", + "72 0.142052 \n", + "74 -1.524396 \n", + "77 -0.211437 \n", + "81 -1.221405 \n", + "91 0.647036 \n", + "96 -1.322402 \n", + "105 0.445043 \n", + "111 0.697535 \n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] \n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 8 columns]" ] }, - "execution_count": 26, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -2704,7 +3123,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -2721,7 +3140,7 @@ " ('linreg', LinearRegression())])" ] }, - "execution_count": 27, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -2748,18 +3167,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "887bf58cebf14bdba95db828390fd33d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job b11be0d8-e6f1-41cb-8cb2-25a38e7ef311 is DONE. 24.7 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ed42cbb3-3d25-47ca-96c5-71a84e426a8c is RUNNING. " ] }, "metadata": {}, @@ -2767,13 +3184,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "24357055792a4eaaa60997fea0f76921", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job f32ea25c-be39-4726-a8f5-604ae83849a6 is DONE. 8.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 3fc74930-03b9-4a49-8ed3-c3edc4dd6e51 is RUNNING. " ] }, "metadata": {}, @@ -2781,13 +3196,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bba878d6d3e345f1a29aea50f7101e8f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 86e29b78-76f5-4937-8bde-407b99af04a2 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 38a4ce3b-5c2a-4d44-b826-f24529d6500b is RUNNING. " ] }, "metadata": {}, @@ -2795,13 +3208,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4bc2c53aeb7d4a8280f9fbbe373f4b55", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ca819734-0d41-4d9e-b743-09edae8c7fee is DONE. 29.6 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ecad776d-77c8-4d94-8186-d5571b512b62 is RUNNING. " ] }, "metadata": {}, @@ -2809,13 +3220,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f4f695cb0a224102b6e26adeb1827981", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 49bb5bed-cc84-47e0-9a90-08ab01e00548 is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c9bfc58f-ce2c-47a9-bbc7-b10d9de9b5a6 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2823,13 +3232,23 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cb1df595006d485288a1060299970e5e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1e40a085-2289-47dd-afd8-820413186b9f is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 8fd8036e-3753-433d-975b-c7b42406f648 is RUNNING. " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 60319296-a480-4f51-b7ad-190ac6de963a is DONE. 6.2 kB processed. Open Job" + ], + "text/plain": [ + "25 rows × 1 columns
\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.116212\n", - "8 4048.683645\n", - "17 3976.452358\n", - "23 3541.580346\n", - "25 4032.842027\n", - "27 4118.34983\n", - "29 4087.765797\n", - "34 3183.75379\n", - "35 3418.800633\n", - "39 3519.18471\n", - "51 3398.133564\n", - "52 3223.614107\n", - "60 3445.012713\n", - "61 3505.637004\n", - "64 3515.903779\n", - "65 4028.361259\n", - "67 4159.991956\n", - "83 3348.167212\n", - "85 3485.048557\n", - "93 4172.872284\n", - "104 3299.300454\n", - "105 3515.68617\n", - "108 3405.222757\n", - "113 4209.13832\n", - "130 4197.90382\n", - "...\n", + "25 rows × 7 columns
\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g island culmen_length_mm \\\n", + "penguin_id \n", + "1 3781.396682 Torgersen 39.1 \n", + "4 4124.102574 Biscoe 43.2 \n", + "8 4670.338389 Biscoe 46.5 \n", + "11 3529.411644 Dream 38.1 \n", + "13 4014.09632 Biscoe 37.8 \n", + "15 5212.407319 Biscoe 48.7 \n", + "16 4163.590502 Torgersen 34.6 \n", + "23 3392.44731 Dream 42.2 \n", + "34 4698.299674 Biscoe 40.9 \n", + "36 4828.221398 Biscoe 43.4 \n", + "42 3430.582874 Biscoe 35.5 \n", + "48 5314.254798 Biscoe 46.3 \n", + "61 5363.19995 Biscoe 46.2 \n", + "64 4855.90281 Biscoe 46.4 \n", + "65 3413.094869 Dream 37.6 \n", + "68 3340.213193 Torgersen 36.2 \n", + "70 4228.726508 Dream 52.8 \n", + "72 3811.532821 Dream 48.5 \n", + "74 4659.765013 Biscoe 42.8 \n", + "77 3453.383042 Dream 37.3 \n", + "81 4766.239424 Biscoe 45.2 \n", + "91 4057.801947 Dream 50.8 \n", + "96 4739.821792 Biscoe 45.4 \n", + "105 3394.886275 Biscoe 34.5 \n", + "111 3201.48777 Biscoe 37.9 \n", "\n", - "[67 rows x 1 columns]" + " culmen_depth_mm flipper_length_mm sex \\\n", + "penguin_id \n", + "1 18.7 181.0 MALE \n", + "4 19.0 197.0 MALE \n", + "8 13.5 210.0 FEMALE \n", + "11 18.6 190.0 FEMALE \n", + "13 20.0 190.0 MALE \n", + "15 15.7 208.0 MALE \n", + "16 21.1 198.0 MALE \n", + "23 18.5 180.0 FEMALE \n", + "34 13.7 214.0 FEMALE \n", + "36 14.4 218.0 FEMALE \n", + "42 16.2 195.0 FEMALE \n", + "48 15.8 215.0 MALE \n", + "61 14.9 221.0 MALE \n", + "64 15.0 216.0 FEMALE \n", + "65 19.3 181.0 FEMALE \n", + "68 17.2 187.0 FEMALE \n", + "70 20.0 205.0 MALE \n", + "72 17.5 191.0 MALE \n", + "74 14.2 209.0 FEMALE \n", + "77 16.8 192.0 FEMALE \n", + "81 14.8 212.0 FEMALE \n", + "91 18.5 201.0 MALE \n", + "96 14.6 211.0 FEMALE \n", + "105 18.1 187.0 FEMALE \n", + "111 18.6 172.0 FEMALE \n", + "\n", + " species \n", + "penguin_id \n", + "1 Adelie Penguin (Pygoscelis adeliae) \n", + "4 Adelie Penguin (Pygoscelis adeliae) \n", + "8 Gentoo penguin (Pygoscelis papua) \n", + "11 Adelie Penguin (Pygoscelis adeliae) \n", + "13 Adelie Penguin (Pygoscelis adeliae) \n", + "15 Gentoo penguin (Pygoscelis papua) \n", + "16 Adelie Penguin (Pygoscelis adeliae) \n", + "23 Adelie Penguin (Pygoscelis adeliae) \n", + "34 Gentoo penguin (Pygoscelis papua) \n", + "36 Gentoo penguin (Pygoscelis papua) \n", + "42 Adelie Penguin (Pygoscelis adeliae) \n", + "48 Gentoo penguin (Pygoscelis papua) \n", + "61 Gentoo penguin (Pygoscelis papua) \n", + "64 Gentoo penguin (Pygoscelis papua) \n", + "65 Adelie Penguin (Pygoscelis adeliae) \n", + "68 Adelie Penguin (Pygoscelis adeliae) \n", + "70 Chinstrap penguin (Pygoscelis antarctica) \n", + "72 Chinstrap penguin (Pygoscelis antarctica) \n", + "74 Gentoo penguin (Pygoscelis papua) \n", + "77 Adelie Penguin (Pygoscelis adeliae) \n", + "81 Gentoo penguin (Pygoscelis papua) \n", + "91 Chinstrap penguin (Pygoscelis antarctica) \n", + "96 Gentoo penguin (Pygoscelis papua) \n", + "105 Adelie Penguin (Pygoscelis adeliae) \n", + "111 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 28, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -3034,60 +3670,16 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2d32081be31f44abb8de67e2209d76cd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 2a043039-670f-4eb8-9cf0-765ee6ed7de6 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job bc8b2042-1e13-441c-9531-300ed5badb7a is RUNNING. " ] }, "metadata": {}, @@ -3095,13 +3687,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4588ae10de634460bf4026ddd9076351", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 7f1f565b-0f73-4a4e-b33f-8484fa260838 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 5e867182-dd7a-4aff-87a8-f7596e900fd5 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -3109,13 +3699,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8209cf8286a545ebb7b6ef9d002a43a1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job d4b9d4a6-d75e-46e1-b092-ab58e8aef890 is DONE. 48 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d4cdb016-8f1e-4960-8ed7-4524ccc5a8a8 is RUNNING. " ] }, "metadata": {}, @@ -3153,12 +3741,12 @@ " \n", "25 rows × 6 columns
\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "0 Dream 36.6 18.4 184.0 3475.0 \n", - "1 Dream 39.8 19.1 184.0 4650.0 \n", - "2 Dream 40.9 18.9 184.0 3900.0 \n", - "4 Dream 37.3 16.8 192.0 3000.0 \n", - "5 Dream 43.2 18.5 192.0 4100.0 \n", - "9 Dream 40.2 20.1 200.0 3975.0 \n", - "10 Dream 40.8 18.9 208.0 4300.0 \n", - "11 Dream 39.0 18.7 185.0 3650.0 \n", - "12 Dream 37.0 16.9 185.0 3000.0 \n", - "14 Dream 34.0 17.1 185.0 3400.0 \n", - "15 Dream 37.0 16.5 185.0 3400.0 \n", - "18 Dream 39.7 17.9 193.0 4250.0 \n", - "19 Dream 37.8 18.1 193.0 3750.0 \n", - "22 Dream 40.2 17.1 193.0 3400.0 \n", - "23 Dream 36.8 18.5 193.0 3500.0 \n", - "26 Dream 41.5 18.5 201.0 4000.0 \n", - "31 Dream 33.1 16.1 178.0 2900.0 \n", - "32 Dream 37.2 18.1 178.0 3900.0 \n", - "33 Dream 39.5 16.7 178.0 3250.0 \n", - "35 Dream 36.0 18.5 186.0 3100.0 \n", - "36 Dream 39.6 18.1 186.0 4450.0 \n", - "38 Dream 41.3 20.3 194.0 3550.0 \n", - "41 Dream 35.7 18.0 202.0 3550.0 \n", - "51 Dream 38.1 17.6 187.0 3425.0 \n", - "53 Dream 36.0 17.1 187.0 3700.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "0 Biscoe 40.1 18.9 188.0 \n", + "1 Torgersen 39.1 18.7 181.0 \n", + "4 Biscoe 43.2 19.0 197.0 \n", + "6 Biscoe 41.3 21.1 195.0 \n", + "11 Dream 38.1 18.6 190.0 \n", + "13 Biscoe 37.8 20.0 190.0 \n", + "14 Biscoe 35.0 17.9 190.0 \n", + "16 Torgersen 34.6 21.1 198.0 \n", + "19 Dream 37.2 18.1 178.0 \n", + "21 Biscoe 40.5 17.9 187.0 \n", + "23 Dream 42.2 18.5 180.0 \n", + "30 Dream 39.2 21.1 196.0 \n", + "32 Torgersen 42.9 17.6 196.0 \n", + "38 Dream 41.1 17.5 190.0 \n", + "40 Torgersen 38.6 21.2 191.0 \n", + "42 Biscoe 35.5 16.2 195.0 \n", + "44 Dream 39.2 18.6 190.0 \n", + "45 Torgersen 35.2 15.9 186.0 \n", + "46 Dream 43.2 18.5 192.0 \n", + "49 Biscoe 39.6 17.7 186.0 \n", + "53 Biscoe 45.6 20.3 191.0 \n", + "58 Torgersen 40.9 16.8 191.0 \n", + "60 Torgersen 40.3 18.0 195.0 \n", + "62 Dream 36.0 18.5 186.0 \n", + "63 Torgersen 39.3 20.6 190.0 \n", "\n", - " sex \n", - "0 FEMALE \n", - "1 MALE \n", - "2 MALE \n", - "4 FEMALE \n", - "5 MALE \n", - "9 MALE \n", - "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "14 FEMALE \n", - "15 FEMALE \n", - "18 MALE \n", - "19 MALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "26 MALE \n", - "31 FEMALE \n", - "32 MALE \n", - "33 FEMALE \n", - "35 FEMALE \n", - "36 MALE \n", - "38 MALE \n", - "41 FEMALE \n", - "51 FEMALE \n", - "53 FEMALE \n", + " body_mass_g sex \n", + "0 4300.0 MALE \n", + "1 3750.0 MALE \n", + "4 4775.0 MALE \n", + "6 4400.0 MALE \n", + "11 3700.0 FEMALE \n", + "13 4250.0 MALE \n", + "14 3450.0 FEMALE \n", + "16 4400.0 MALE \n", + "19 3900.0 MALE \n", + "21 3200.0 FEMALE \n", + "23 3550.0 FEMALE \n", + "30 4150.0 MALE \n", + "32 4700.0 MALE \n", + "38 3900.0 MALE \n", + "40 3800.0 MALE \n", + "42 3350.0 FEMALE \n", + "44 4250.0 MALE \n", + "45 3050.0 FEMALE \n", + "46 4100.0 MALE \n", + "49 3500.0 FEMALE \n", + "53 4600.0 MALE \n", + "58 3700.0 FEMALE \n", + "60 3250.0 FEMALE \n", + "62 3100.0 FEMALE \n", + "63 3650.0 MALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -843,18 +793,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "81f9aa34c7234bd88b6b7a4bc77d4b4e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 0808457b-a0df-4a37-b7a5-8885f4a4588c is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 288f0daa-a51e-45b4-86bf-d054467c4a99 is DONE. 28.9 kB processed. " ] }, "metadata": {}, @@ -881,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -897,7 +845,7 @@ " ('linreg', LinearRegression(fit_intercept=False))])" ] }, - "execution_count": 15, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -936,9 +884,63 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job e9bfa6a5-a53f-4d8b-ae8c-cc8cd55d0947 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "3 rows × 1 columns
\n", - "[3 rows x 1 columns in total]" + "3 rows × 7 columns
\n", + "[3 rows x 7 columns in total]" ], "text/plain": [ - " predicted_body_mass_g\n", - "tag_number \n", - "1633 3965.994361\n", - "1672 3246.312058\n", - "1690 3456.404062\n", + " predicted_body_mass_g species \\\n", + "tag_number \n", + "1633 4017.203152 Adelie Penguin (Pygoscelis adeliae) \n", + "1672 3127.601519 Adelie Penguin (Pygoscelis adeliae) \n", + "1690 3386.101231 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "tag_number \n", + "1633 Torgersen 39.5 18.8 196.0 \n", + "1672 Torgersen 38.5 17.2 181.0 \n", + "1690 Dream 37.9 18.1 188.0 \n", "\n", - "[3 rows x 1 columns]" + " sex \n", + "tag_number \n", + "1633 MALE \n", + "1672 FEMALE \n", + "1690 FEMALE \n", + "\n", + "[3 rows x 7 columns]" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1240,28 +1250,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Save in BigQuery" + "## 6. Save in BigQuery" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "Copy job d1def4a4-1da1-43a9-8ae5-4459444d993d is DONE. Open Job" + ], "text/plain": [ - "Pipeline(steps=[('preproc',\n", - " ColumnTransformer(transformers=[('onehot', OneHotEncoder(),\n", - " ['island', 'species', 'sex']),\n", - " ('scaler', StandardScaler(),\n", - " ['culmen_depth_mm',\n", - " 'culmen_length_mm',\n", - " 'flipper_length_mm'])])),\n", - " ('linreg', LinearRegression(fit_intercept=False))])" + "\n", - " | consumer_complaint_narrative | \n", - "
---|---|
0 | \n", - "I signed a contract as a condition of employme... | \n", - "
1 | \n", - "First, I want to disclose that XXXX and XXXX b... | \n", - "
2 | \n", - "Frequent calls from Focused Receivables Manage... | \n", - "
3 | \n", - "I recently contacted Enhanced Recovery Company... | \n", - "
4 | \n", - "This began when I subscribed to XXXX XXXX inte... | \n", - "
5 rows × 1 columns
\n", - "\n", - " | text_embedding | \n", - "
---|---|
422 | \n", - "[-0.012013785541057587, 0.003669967409223318, ... | \n", - "
616 | \n", - "[-0.014948881231248379, -0.04672442376613617, ... | \n", - "
833 | \n", - "[-0.01951478235423565, -0.027120858430862427, ... | \n", - "
1370 | \n", - "[-0.03140445053577423, -0.048797041177749634, ... | \n", - "
1430 | \n", - "[-0.02244548313319683, -0.03336532413959503, 0... | \n", - "
5 rows × 1 columns
\n", - "\n", - " | consumer_complaint_narrative | \n", - "text_embedding | \n", - "
---|---|---|
2580664 | \n", - "Hello, my name is XXXX XXXX, and I am writing ... | \n", - "[0.0003211698785889894, -0.01816680282354355, ... | \n", - "
1806973 | \n", - "This is XXXX XXXX and I am submitting this com... | \n", - "[-0.009485247544944286, -0.025846892967820168,... | \n", - "
2055053 | \n", - "XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... | \n", - "[-0.010950954630970955, -0.0249345600605011, 0... | \n", - "
2515231 | \n", - "When I reinvestigated my credit report, I real... | \n", - "[-0.009660656563937664, -0.05793113633990288, ... | \n", - "
2633049 | \n", - "Checking my credit report XX/XX/2018 with all ... | \n", - "[-0.0022159104701131582, -0.03330004960298538,... | \n", - "
3117273 | \n", - "I contacted TransUnion and spoke a credit rep ... | \n", - "[-0.015955328941345215, -0.006488671060651541,... | \n", - "
698814 | \n", - "XXXX XXXX XXXX. makes daily calls to me cell c... | \n", - "[0.005397460889071226, -0.01276913657784462, 0... | \n", - "
267826 | \n", - "Can we please reopen Case : XXXX? \n", - "\n", - "Wells Farg... | \n", - "[0.004065403249114752, -0.0005381882656365633,... | \n", - "
54019 | \n", - "My rights under 15 USC 1681 have been violated... | \n", - "[0.013823015615344048, -0.02010691538453102, 0... | \n", - "
141050 | \n", - "To whom it may concern : My personal informati... | \n", - "[0.008104532025754452, -0.01856449618935585, 0... | \n", - "
2962076 | \n", - "I have had a CashApp account since last year, ... | \n", - "[-0.0003019514260813594, -0.03750108182430267,... | \n", - "
2481105 | \n", - "that some of the information was erroneous. Th... | \n", - "[-0.014868081547319889, -0.0443895161151886, -... | \n", - "
431562 | \n", - "I have disputed the referenced accounts to the... | \n", - "[-0.0020524838473647833, -0.04830990731716156,... | \n", - "
1953029 | \n", - "On, XX/XX/22, I attempted to complete a transa... | \n", - "[-0.01599179394543171, -0.0074900356121361256,... | \n", - "
2395979 | \n", - "Subject : XXXX XXXX XXXX compensation, refund,... | \n", - "[-0.0035950862802565098, -0.014652969315648079... | \n", - "
455524 | \n", - "I paid off my mortgage on XX/XX/2019. The comp... | \n", - "[-0.01100730150938034, -0.03495829552412033, 0... | \n", - "
2155924 | \n", - "This kind of account is placed as a charged of... | \n", - "[-0.028635455295443535, -0.028604287654161453,... | \n", - "
1069497 | \n", - "This is one of many issues I have had with Wel... | \n", - "[0.008871790021657944, -0.028502725064754486, ... | \n", - "
3181689 | \n", - "I have disputed this account with MONTEREY FIN... | \n", - "[-0.004721717908978462, -0.03673810139298439, ... | \n", - "
274268 | \n", - "Lender is not updating my loan status in the V... | \n", - "[-0.009221495129168034, -0.0289347805082798, 0... | \n", - "
1671305 | \n", - "XXXX is a peer to peer lending conmpany that u... | \n", - "[-0.02911308966577053, -0.01850792020559311, -... | \n", - "
886026 | \n", - "( DISPUTE CODE - XXXX ) My personal informatio... | \n", - "[-0.007220877334475517, -0.016615957021713257,... | \n", - "
1044431 | \n", - "I filed a complaint against PNC this year and ... | \n", - "[0.002848619595170021, -0.035117778927087784, ... | \n", - "
1938481 | \n", - "I applied for a modification and was approved.... | \n", - "[-0.03114932030439377, -0.0421406552195549, 0.... | \n", - "
1987834 | \n", - "Ive been Disputting my XXXX XXXX I opened this... | \n", - "[-0.009406660683453083, -0.020967338234186172,... | \n", - "
25 rows × 2 columns
\n", - "\n", - " | CENTROID_ID | \n", - "
---|---|
422 | \n", - "2 | \n", - "
616 | \n", - "3 | \n", - "
833 | \n", - "5 | \n", - "
1370 | \n", - "7 | \n", - "
1430 | \n", - "3 | \n", - "
5 rows × 1 columns
\n", - "\n", - " | consumer_complaint_narrative | \n", - "text_embedding | \n", - "CENTROID_ID | \n", - "
---|---|---|---|
2580664 | \n", - "Hello, my name is XXXX XXXX, and I am writing ... | \n", - "[0.0003211698785889894, -0.01816680282354355, ... | \n", - "2 | \n", - "
1806973 | \n", - "This is XXXX XXXX and I am submitting this com... | \n", - "[-0.009485247544944286, -0.025846892967820168,... | \n", - "5 | \n", - "
2055053 | \n", - "XXXX XXXX XXXX, XXXX. ( address : XXXX XXXX XX... | \n", - "[-0.010950954630970955, -0.0249345600605011, 0... | \n", - "3 | \n", - "
2515231 | \n", - "When I reinvestigated my credit report, I real... | \n", - "[-0.009660656563937664, -0.05793113633990288, ... | \n", - "5 | \n", - "
2633049 | \n", - "Checking my credit report XX/XX/2018 with all ... | \n", - "[-0.0022159104701131582, -0.03330004960298538,... | \n", - "3 | \n", - "
5 rows × 3 columns
\n", - "