googleapis · tswast · Jul 30, 2024 · Jul 24, 2024 · Jul 25, 2024 · Jul 25, 2024
@@ -2292,11 +2292,11 @@ def join(
                 f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}"
             )
         # Handle null index, which only supports row join
-        if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join:
-            if not block_identity_join:
-                result = try_row_join(self, other, how=how)
-                if result is not None:
-                    return result
+        # Row join is the canonical way to align objects with a null index, so always attempt it (ignoring block_identity_join)
+        if self.index.nlevels == other.index.nlevels == 0:
+            result = try_row_join(self, other, how=how)
+            if result is not None:
+                return result
             raise bigframes.exceptions.NullIndexError(
                 "Cannot implicitly align objects. Set an explicit index using set_index."
             )

@@ -83,7 +83,7 @@ def distance(
         """
         assert len(x.columns) == 1 and len(y.columns) == 1
 
-        input_data = x.cache().join(y.cache(), how="outer")
+        input_data = x.join(y, how="outer").cache()
         x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0]
 
         return self._apply_sql(
@@ -326,7 +326,7 @@ def create_model(
         if y_train is None:
             input_data = X_train.cache()
         else:
-            input_data = X_train.cache().join(y_train.cache(), how="outer")
+            input_data = X_train.join(y_train, how="outer").cache()
             options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()})
 
         session = X_train._session
@@ -366,7 +366,7 @@ def create_llm_remote_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
-        input_data = X_train.cache().join(y_train.cache(), how="outer")
+        input_data = X_train.join(y_train, how="outer").cache()
         options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()})
 
         session = X_train._session
@@ -399,7 +399,7 @@ def create_time_series_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
-        input_data = X_train.cache().join(y_train.cache(), how="outer")
+        input_data = X_train.join(y_train, how="outer").cache()
         options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]})
         options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]})
 

@@ -111,6 +111,50 @@ def test_linear_regression_customized_params_fit_score(
     assert reloaded_model.learning_rate == 0.2
 
 
+def test_unordered_mode_regression_configure_fit_score(
+    unordered_session, penguins_table_id, dataset_id
+):
+    """Fit, score, save and reload a LinearRegression on data read via ``unordered_session``.
+
+    The model is constructed without arguments, trained on columns selected from
+    the table ``penguins_table_id``, scored, written to ``dataset_id`` with
+    ``to_gbq`` and the reloaded model's parameters are checked.
+    """
+    model = bigframes.ml.linear_model.LinearRegression()
+
+    # Features and label below are both column selections of this one DataFrame.
+    df = unordered_session.read_gbq(penguins_table_id).dropna()
+    X_train = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+            "sex",
+        ]
+    ]
+    y_train = df[["body_mass_g"]]
+    # NOTE(review): presumably fit() joins X_train and y_train internally, which is
+    # the path this test is meant to exercise for the unordered session -- confirm.
+    model.fit(X_train, y_train)
+
+    # Check score to ensure the model was fitted
+    result = model.score(X_train, y_train).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        result, columns=utils.ML_REGRESSION_METRICS, index=1
+    )
+
+    # save, load, check parameters to ensure configuration was kept
+    reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True)
+    assert reloaded_model._bqml_model is not None
+    assert (
+        f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name
+    )
+    # Expected values for a model created without explicit options; presumably
+    # these are the library/BQML defaults -- TODO confirm.
+    assert reloaded_model.optimize_strategy == "NORMAL_EQUATION"
+    assert reloaded_model.fit_intercept is True
+    assert reloaded_model.calculate_p_values is False
+    assert reloaded_model.enable_global_explain is False
+    assert reloaded_model.l1_reg is None
+    assert reloaded_model.l2_reg == 0.0
+    assert reloaded_model.learning_rate is None
+    assert reloaded_model.learning_rate_strategy == "line_search"
+    assert reloaded_model.ls_init_learning_rate is None
+    assert reloaded_model.max_iterations == 20
+    assert reloaded_model.tol == 0.01
+
+
 # TODO(garrettwu): add tests for param warm_start. Requires a trained model.
 
 

@@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index
     )
 
 
+def test_null_index_series_self_join(
+    scalars_df_null_index, scalars_pandas_df_default_index
+):
+    """Join two single-column selections of ``scalars_df_null_index`` and compare with pandas.
+
+    The same join is performed on ``scalars_pandas_df_default_index`` and the
+    two results are compared with ``pd.testing.assert_frame_equal``.
+    """
+    # Both operands are selected from the same scalars_df_null_index object.
+    bf_result = scalars_df_null_index[["int64_col"]].join(
+        scalars_df_null_index[["int64_too"]]
+    )
+    pd_result = scalars_pandas_df_default_index[["int64_col"]].join(
+        scalars_pandas_df_default_index[["int64_too"]]
+    )
+    # The pandas index is dropped before comparing, and dtypes are not checked.
+    pd.testing.assert_frame_equal(
+        bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
+    )
+
+
 def test_null_index_series_self_aligns(
     scalars_df_null_index, scalars_pandas_df_default_index
 ):

@@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session):
         ["index_column_label"],
     )
     mock_X.join(mock_y).sql = "input_X_y_sql"
+    mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y)
     mock_X.join(mock_y)._to_sql_query.return_value = (
         "input_X_y_sql",
         ["index_column_id"],