diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2d7c543678..29cbbb3b3b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2292,11 +2292,11 @@ def join( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) # Handle null index, which only supports row join - if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join: - if not block_identity_join: - result = try_row_join(self, other, how=how) - if result is not None: - return result + # This is the canonical way of aligning on null index, so always allow (ignore block_identity_join) + if self.index.nlevels == other.index.nlevels == 0: + result = try_row_join(self, other, how=how) + if result is not None: + return result raise bigframes.exceptions.NullIndexError( "Cannot implicitly align objects. Set an explicit index using set_index." ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ee4d8a8c27..f1b36651f4 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -83,7 +83,7 @@ def distance( """ assert len(x.columns) == 1 and len(y.columns) == 1 - input_data = x.cache().join(y.cache(), how="outer") + input_data = x.join(y, how="outer").cache() x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] return self._apply_sql( @@ -326,7 +326,7 @@ def create_model( if y_train is None: input_data = X_train.cache() else: - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -366,7 +366,7 @@ def create_llm_remote_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -399,7 +399,7 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 0cc9fc5353..2f4c07fa28 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -111,6 +111,50 @@ def test_linear_regression_customized_params_fit_score( assert reloaded_model.learning_rate == 0.2 +def test_unordered_mode_regression_configure_fit_score( + unordered_session, penguins_table_id, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() + + df = unordered_session.read_gbq(penguins_table_id).dropna() + X_train = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) + + # Check score to ensure the model was fitted + result = model.score(X_train, y_train).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 + + # TODO(garrettwu): add tests for param warm_start. Requires a trained model. diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 27a3d8dffe..a1e360f73d 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index ) +def test_null_index_series_self_join( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index[["int64_col"]].join( + scalars_df_null_index[["int64_too"]] + ) + pd_result = scalars_pandas_df_default_index[["int64_col"]].join( + scalars_pandas_df_default_index[["int64_too"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_series_self_aligns( scalars_df_null_index, scalars_pandas_df_default_index ): diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 48fb7011ea..aa7e919b24 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session): ["index_column_label"], ) mock_X.join(mock_y).sql = "input_X_y_sql" + mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( "input_X_y_sql", ["index_column_id"],