From ff31b7e9d20489efb122fff47e32091d446f717d Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 19 Mar 2025 22:16:51 +0000 Subject: [PATCH 1/8] refactor: introduce ai operator namespace and deprecated semantics --- bigframes/dataframe.py | 9 + bigframes/operations/ai.py | 898 +++++++++++++++++++++ tests/system/large/operations/test_ai.py | 956 +++++++++++++++++++++++ tests/system/small/operations/test_ai.py | 122 +++ tests/unit/test_dataframe.py | 11 + 5 files changed, 1996 insertions(+) create mode 100644 bigframes/operations/ai.py create mode 100644 tests/system/large/operations/test_ai.py create mode 100644 tests/system/small/operations/test_ai.py diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index abab9fd268..e6f97ea27f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -73,6 +73,7 @@ import bigframes.operations as ops import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops +import bigframes.operations.ai import bigframes.operations.plotting as plotting import bigframes.operations.semantics import bigframes.operations.structs @@ -4518,4 +4519,12 @@ def _throw_if_null_index(self, opname: str): @property def semantics(self): + warnings.warn( + "The 'semantic' property will be removed. Please use 'ai' instead.", + FutureWarning, + ) return bigframes.operations.semantics.Semantics(self) + + @property + def ai(self): + return bigframes.operations.ai.Ai(self) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py new file mode 100644 index 0000000000..f54066f1e3 --- /dev/null +++ b/bigframes/operations/ai.py @@ -0,0 +1,898 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import typing +from typing import List, Optional +import warnings + +import numpy as np + +from bigframes import dtypes, exceptions +from bigframes.core import guid, log_adapter + + +@log_adapter.class_logger +class Ai: + def __init__(self, df) -> None: + import bigframes # Import in the function body to avoid circular imports. + import bigframes.dataframe + + if not bigframes.options.experiments.semantic_operators: + raise NotImplementedError() + + self._df: bigframes.dataframe.DataFrame = df + + def filter(self, instruction: str, model, ground_with_google_search: bool = False): + """ + Filters the DataFrame with the semantics of the user instruction. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}) + >>> df.ai.filter("{city} is the capital of {country}", model) + country city + 1 Germany Berlin + + [1 rows x 2 columns] + + Args: + instruction (str): + An instruction on how to filter the data. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example, if you have a column "food", you can refer to this column + in the instructions like: + "The {food} is healthy." 
+ + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.pandas.DataFrame: DataFrame filtered by the instruction. + + Raises: + NotImplementedError: when the semantic operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. + """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. 
See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + self._confirm_operation(len(self._df)) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False + for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + user_instruction = self._format_instruction(instruction, columns) + output_instruction = "Based on the provided context, reply to the following claim by only True or False:" + + if has_blob_column: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + df, + prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + else: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + + return self._df[ + results["ml_generate_text_llm_result"].str.lower().str.contains("true") + ] + + def map( + self, + instruction: str, + output_column: str, + model, + ground_with_google_search: bool = False, + ): + """ + Maps the DataFrame with the semantics of the user instruction. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}) + >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model) + ingredient_1 ingredient_2 food + 0 Burger Bun Beef Patty Burger + <BLANKLINE> + 1 Soy Bean Bittern Tofu + <BLANKLINE> + <BLANKLINE> + [2 rows x 3 columns] + + Args: + instruction (str): + An instruction on how to map the data. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example, if you have a column "food", you can refer to this column + in the instructions like: + "Get the ingredients of {food}." + + output_column (str): + The column name of the mapping result. + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.pandas.DataFrame: DataFrame with attached mapping results. + + Raises: + NotImplementedError: when the semantic operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. 
+ """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + self._confirm_operation(len(self._df)) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False + for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + user_instruction = self._format_instruction(instruction, columns) + output_instruction = ( + "Based on the provided contenxt, answer the following instruction:" + ) + + if has_blob_column: + results = typing.cast( + bigframes.series.Series, + model.predict( + df, + prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) + else: + results = typing.cast( + bigframes.series.Series, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) + + from bigframes.core.reshape.api import concat + + return concat([self._df, results.rename(output_column)], axis=1) + + def join( + self, + other, + instruction: str, + model, + ground_with_google_search: bool = False, + ): + """ + Joines two dataframes by applying the instruction over each pair of rows from + the left and right 
table. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']}) + >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']}) + + >>> cities.ai.join(continents, "{city} is in {continent}", model) + city continent + 0 Seattle North America + 1 Ottawa North America + 2 Shanghai Asia + 3 New Delhi Asia + <BLANKLINE> + [4 rows x 2 columns] + + Args: + other (bigframes.pandas.DataFrame): + The other dataframe. + + instruction (str): + An instruction on how left and right rows can be joined. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example: "The {city} belongs to the {country}". + For column names that are shared between two dataframes, you need to add "left." + and "right." prefixes for differentiation. This is especially important when you do + self joins. For example: "The {left.employee_name} reports to {right.employee_name}" + For unique column names, this prefix is optional. + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + max_rows (int, default 1000): + The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method + call will end early with an error. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. 
Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.pandas.DataFrame: The joined dataframe. + + Raises: + ValueError if the amount of data that will be sent for LLM processing is larger than max_rows. + """ + self._validate_model(model) + columns = self._parse_columns(instruction) + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) + + left_columns = [] + right_columns = [] + + for col in columns: + if col in self._df.columns and col in other.columns: + raise ValueError(f"Ambiguous column reference: {col}") + + elif col in self._df.columns: + left_columns.append(col) + + elif col in other.columns: + right_columns.append(col) + + elif col.startswith("left."): + original_col_name = col[len("left.") :] + if ( + original_col_name in self._df.columns + and original_col_name in other.columns + ): + left_columns.append(col) + elif original_col_name in self._df.columns: + left_columns.append(col) + instruction = instruction.replace(col, original_col_name) + else: + raise ValueError(f"Column {col} not found") + + elif col.startswith("right."): + original_col_name = col[len("right.") :] + if ( + original_col_name in self._df.columns + and original_col_name in other.columns + ): + right_columns.append(col) + elif original_col_name in other.columns: + right_columns.append(col) + instruction = instruction.replace(col, original_col_name) + else: + raise ValueError(f"Column {col} not found") + + else: + raise ValueError(f"Column {col} not found") + + if not left_columns: + raise ValueError("No left column references.") + + if not right_columns: + raise 
ValueError("No right column references.") + + # Update column references to be compatible with internal naming scheme. + # That is, "left.col" -> "col_left" and "right.col" -> "col_right" + instruction = re.sub(r"(?>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> import bigframes + >>> bigframes.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + + >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) + >>> df.ai.search("creatures", "monkey", top_k=1, model=model, score_column='distance') + creatures distance + 3 chimpanzee 0.635844 + + [1 rows x 2 columns] + + Args: + search_column: + The name of the column to search from. + query (str): + The search query. + top_k (int): + The number of nearest neighbors to return. + model (TextEmbeddingGenerator): + A TextEmbeddingGenerator provided by Bigframes ML package. + score_column (Optional[str], default None): + The name of the the additional column containning the similarity scores. If None, + this column won't be attached to the result. + + Returns: + DataFrame: the DataFrame with the search result. + + Raises: + ValueError: when the search_column is not found from the the data frame. + TypeError: when the provided model is not TextEmbeddingGenerator. 
+ """ + + if search_column not in self._df.columns: + raise ValueError(f"Column `{search_column}` not found") + + self._confirm_operation(len(self._df)) + + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + if top_k < 1: + raise ValueError("top_k must be an integer greater than or equal to 1.") + + embedded_df = model.predict(self._df[search_column]) + embedded_table = embedded_df.reset_index().to_gbq() + + import bigframes.pandas as bpd + + embedding_result_column = "ml_generate_embedding_result" + query_df = model.predict(bpd.DataFrame({"query_id": [query]})).rename( + columns={"content": "query_id", embedding_result_column: "embedding"} + ) + + import bigframes.bigquery as bbq + + search_result = ( + bbq.vector_search( + base_table=embedded_table, + column_to_search=embedding_result_column, + query=query_df, + top_k=top_k, + ) + .rename(columns={"content": search_column}) + .set_index("index") + ) + + search_result.index.name = self._df.index.name + + if score_column is not None: + search_result = search_result.rename(columns={"distance": score_column})[ + [search_column, score_column] + ] + else: + search_result = search_result[[search_column]] + + import bigframes.dataframe + + return typing.cast(bigframes.dataframe.DataFrame, search_result) + + def top_k( + self, + instruction: str, + model, + k: int = 10, + ground_with_google_search: bool = False, + ): + """ + Ranks each tuple and returns the k best according to the instruction. + + This method employs a quick select algorithm to efficiently compare the pivot + with all other items. By leveraging an LLM (Large Language Model), it then + identifies the top 'k' best answers from these comparisons. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"Animals": ["Dog", "Bird", "Cat", "Horse"]}) + >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) + Animals + 0 Dog + 2 Cat + <BLANKLINE> + [2 rows x 1 columns] + + Args: + instruction (str): + An instruction on how to rank the data. This value must contain + column references by name enclosed in braces. + For example, to reference a column named "Animals", use "{Animals}" in the + instruction, like: "{Animals} are more popular as pets" + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by the Bigframes ML package. + + k (int, default 10): + The number of rows to return. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + Returns: + bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. + + Raises: + NotImplementedError: when the semantic operator experiment is off, or when more than one column is referenced. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. 
+ """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + if len(columns) > 1: + raise NotImplementedError( + "Semantic aggregations are limited to a single column." + ) + + if ground_with_google_search: + msg = exceptions.format_message( + "Enables Grounding with Google Search may impact billing cost. See pricing " + "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" + ) + warnings.warn(msg, category=UserWarning) + + work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) + self._confirm_operation(work_estimate) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + column = columns[0] + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + # `index` is reserved for the `reset_index` below. + if column == "index": + raise ValueError( + "Column name 'index' is reserved. Please choose a different name." + ) + + if k < 1: + raise ValueError("k must be an integer greater than or equal to 1.") + + user_instruction = self._format_instruction(instruction, columns) + + n = df.shape[0] + if k >= n: + return df + + # Create a unique index and duplicate it as the "index" column. This workaround + # is needed for the select search algorithm due to unimplemented bigFrame methods. + df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() + + # Initialize a status column to track the selection status of each item. 
+ # - None: Unknown/not yet processed + # - 1.0: Selected as part of the top-k items + # - -1.0: Excluded from the top-k items + status_column = guid.generate_guid("status") + df[status_column] = bigframes.series.Series(None, dtype=dtypes.FLOAT_DTYPE) + + num_selected = 0 + while num_selected < k: + df, num_new_selected = self._topk_partition( + df, + column, + status_column, + user_instruction, + model, + k - num_selected, + ground_with_google_search, + ) + num_selected += num_new_selected + + df = ( + df[df[status_column] > 0] + .drop(["index", status_column], axis=1) + .rename(columns={"old_index": "index"}) + .set_index("index") + ) + df.index.name = None + return df + + @staticmethod + def _topk_partition( + df, + column: str, + status_column: str, + user_instruction: str, + model, + k: int, + ground_with_google_search: bool, + ): + output_instruction = ( + "Given a question and two documents, choose the document that best answers " + "the question. Respond with 'Document 1' or 'Document 2'. You must choose " + "one, even if neither is ideal. " + ) + + # Random pivot selection for improved average quickselect performance. + pending_df = df[df[status_column].isna()] + pivot_iloc = np.random.randint(0, pending_df.shape[0]) + pivot_index = pending_df.iloc[pivot_iloc]["index"] + pivot_df = pending_df[pending_df["index"] == pivot_index] + + # Build a prompt to compare the pivot item's relevance to other pending items. 
+ prompt_s = pending_df[pending_df["index"] != pivot_index][column] + prompt_s = ( + f"{output_instruction}\n\nQuestion: {user_instruction}\n" + + f"\nDocument 1: {column} " + + pivot_df.iloc[0][column] + + f"\nDocument 2: {column} " + + prompt_s # type:ignore + ) + + import bigframes.dataframe + + predict_df = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + prompt_s, + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + + marks = predict_df["ml_generate_text_llm_result"].str.contains("2") + more_relavant: bigframes.dataframe.DataFrame = df[marks] + less_relavent: bigframes.dataframe.DataFrame = df[~marks] + + num_more_relavant = more_relavant.shape[0] + if k < num_more_relavant: + less_relavent[status_column] = -1.0 + pivot_df[status_column] = -1.0 + df = df.combine_first(less_relavent).combine_first(pivot_df) + return df, 0 + else: # k >= num_more_relavant + more_relavant[status_column] = 1.0 + df = df.combine_first(more_relavant) + if k >= num_more_relavant + 1: + pivot_df[status_column] = 1.0 + df = df.combine_first(pivot_df) + return df, num_more_relavant + 1 + else: + return df, num_more_relavant + + def sim_join( + self, + other, + left_on: str, + right_on: str, + model, + top_k: int = 3, + score_column: Optional[str] = None, + max_rows: int = 1000, + ): + """ + Joins two dataframes based on the similarity of the specified columns. + + This method uses BigQuery's VECTOR_SEARCH function to match rows on the left side with the rows that have + nearest embedding vectors on the right. In the worst case scenario, the complexity is around O(M * N * log K). + Therefore, this is a potentially expensive operation. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") + + >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) + >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']}) + + >>> df1.ai.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1) + animal animal_1 + 0 monkey baboon + 1 spider scorpion + <BLANKLINE> + [2 rows x 2 columns] + + Args: + other (DataFrame): + The other data frame to join with. + left_on (str): + The name of the column on the left side for the join. + right_on (str): + The name of the column on the right side for the join. + top_k (int, default 3): + The number of nearest neighbors to return. + model (TextEmbeddingGenerator): + A TextEmbeddingGenerator provided by Bigframes ML package. + score_column (Optional[str], default None): + The name of the additional column containing the similarity scores. If None, + this column won't be attached to the result. + max_rows (int, default 1000): + The maximum number of rows allowed to be processed per call. If the result is too large, the method + call will end early with an error. + + Returns: + DataFrame: the data frame with the join result. + + Raises: + ValueError: when the amount of data to be processed exceeds the specified max_rows. 
+ """ + + if left_on not in self._df.columns: + raise ValueError(f"Left column {left_on} not found") + if right_on not in self._df.columns: + raise ValueError(f"Right column {right_on} not found") + + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + joined_table_rows = len(self._df) * len(other) + if joined_table_rows > max_rows: + raise ValueError( + f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." + ) + + if top_k < 1: + raise ValueError("top_k must be an integer greater than or equal to 1.") + + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) + + base_table_embedding_column = guid.generate_guid() + base_table = self._attach_embedding( + other, right_on, base_table_embedding_column, model + ).to_gbq() + query_table = self._attach_embedding(self._df, left_on, "embedding", model) + + import bigframes.bigquery as bbq + + join_result = bbq.vector_search( + base_table=base_table, + column_to_search=base_table_embedding_column, + query=query_table, + top_k=top_k, + ) + + join_result = join_result.drop( + ["embedding", base_table_embedding_column], axis=1 + ) + + if score_column is not None: + join_result = join_result.rename(columns={"distance": score_column}) + else: + del join_result["distance"] + + return join_result + + @staticmethod + def _attach_embedding(dataframe, source_column: str, embedding_column: str, model): + result_df = dataframe.copy() + embeddings = model.predict(dataframe[source_column])[ + "ml_generate_embedding_result" + ] + result_df[embedding_column] = embeddings + return result_df + + @staticmethod + def _make_multimodel_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str + ): + prompt = [f"{output_instruction}\n{user_instruction}\nContext: "] + for col in columns: + prompt.extend([f"{col} is ", prompt_df[col]]) + + 
return prompt + + @staticmethod + def _make_text_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str + ): + prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: " + + # Combine context from multiple columns. + for col in columns: + prompt_df["prompt"] += f"{col} is `" + prompt_df[col] + "`\n" + + return prompt_df["prompt"] + + @staticmethod + def _parse_columns(instruction: str) -> List[str]: + """Extracts column names enclosed in curly braces from the user instruction. + For example, _parse_columns("{city} is in {continent}") == ["city", "continent"] + """ + columns = re.findall(r"(? str: + """Extracts column names enclosed in curly braces from the user instruction. + For example, `_format_instruction(["city", "continent"], "{city} is in {continent}") + == "city is in continent"` + """ + return instruction.format(**{col: col for col in columns}) + + @staticmethod + def _validate_model(model): + from bigframes.ml.llm import GeminiTextGenerator + + if not isinstance(model, GeminiTextGenerator): + raise TypeError("Model is not GeminiText Generator") + + @staticmethod + def _confirm_operation(row_count: int): + """Raises OperationAbortedError when the confirmation fails""" + import bigframes # Import in the function body to avoid circular imports. + + threshold = bigframes.options.compute.semantic_ops_confirmation_threshold + + if threshold is None or row_count <= threshold: + return + + if bigframes.options.compute.semantic_ops_threshold_autofail: + raise exceptions.OperationAbortedError( + f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows." + ) + + # Separate the prompt out. In IDE such VS Code, leaving prompt in the + # input function makes it less visible to the end user. 
+ print(f"This operation will process about {row_count} rows.") + print( + "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." + ) + print("Proceed? [Y/n]") + reply = input().casefold() + if reply not in {"y", "yes", ""}: + raise exceptions.OperationAbortedError("Operation was cancelled.") diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py new file mode 100644 index 0000000000..01966533f8 --- /dev/null +++ b/tests/system/large/operations/test_ai.py @@ -0,0 +1,956 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from contextlib import nullcontext +from unittest.mock import patch + +import pandas as pd +import pandas.testing +import pytest + +import bigframes +from bigframes import dataframe, exceptions, series + +SEM_OP_EXP_OPTION = "experiments.semantic_operators" +BLOB_EXP_OPTION = "experiments.blob" +THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" + + +def test_ai_experiment_off_raise_error(): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( + NotImplementedError + ): + df.ai + + +def test_filter(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame( + {"country": ["Germany"], "city": ["Berlin"], "year": [2024]}, index=[1] + ) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +def test_filter_multi_model(session, gemini_flash_model): + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["prey"] = series.Series( + ["building", "cross road", "rock", "squirrel", "rabbit"], session=session + ) + result = df.ai.filter( + "The object in {image} feeds on {prey}", + gemini_flash_model, + ).to_pandas() + + assert len(result) <= len(df) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def 
test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.filter("{city} is the capital of {country} in {year}", gemini_flash_model) + + +def test_filter_single_column_reference(session, gemini_flash_model): + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.filter( + "{country} is in Europe", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{city} is in the {non_existing_column}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.filter(instruction, gemini_flash_model) + + +def test_filter_invalid_model_raise_error(): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + 
THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df.ai.filter("{city} is the capital of {country}", None) + + +def test_map(session, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ).to_pandas() + # Result sanitation + actual_df["food"] = actual_df["food"].str.strip().str.lower() + + expected_df = pd.DataFrame( + { + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + "food": ["burger", "tofu"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +def test_map_multimodel(session, gemini_flash_model): + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["scenario"] = series.Series( + ["building", "cross road", "tree", "squirrel", "rabbit"], session=session + ) + result = df.ai.map( + "What is the object in {image} combined with {scenario}? 
One word only.", + "object", + gemini_flash_model, + ).to_pandas() + + assert len(result) == len(df) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "What is the food made from {ingredient_1} and {non_existing_column}?}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame( + data={ + "id": [1, 2], + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + } + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.map(instruction, "food", gemini_flash_model) + + +def test_map_invalid_model_raise_error(): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + }, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df.ai.map( + 
"What is the food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + None, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param("{city} is in {country}", id="no_dataframe_reference"), + pytest.param("{left.city} is in {country}", id="has_left_dataframe_reference"), + pytest.param( + "{city} is in {right.country}", + id="has_right_dataframe_reference", + ), + pytest.param( + "{left.city} is in {right.country}", id="has_both_dataframe_references" + ), + ], +) +def test_join(instruction, session, gemini_flash_model): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.ai.join( + countries, + instruction, + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "city": ["Seattle", "Berlin"], + "country": ["USA", "Germany"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + cities.ai.join( + countries, + "{city} is in {country}", + gemini_flash_model, + ) + + +def test_self_join(session, gemini_flash_model): + animals = dataframe.DataFrame( + data={ 
+ "animal": ["spider", "capybara"], + }, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = animals.ai.join( + animals, + "{left.animal} is heavier than {right.animal}", + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "animal_left": ["capybara"], + "animal_right": ["spider"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + ("instruction", "error_pattern"), + [ + ("No column reference", "No column references"), + pytest.param( + "{city} is in {continent}", r"Column .+ not found", id="non_existing_column" + ), + pytest.param( + "{city} is in {country}", + r"Ambiguous column reference: .+", + id="ambiguous_column", + ), + pytest.param( + "{right.city} is in {country}", r"Column .+ not found", id="wrong_prefix" + ), + pytest.param( + "{city} is in {right.continent}", + r"Column .+ not found", + id="prefix_on_non_existing_column", + ), + ], +) +def test_join_invalid_instruction_raise_error( + instruction, error_pattern, gemini_flash_model +): + df1 = dataframe.DataFrame( + {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} + ) + df2 = dataframe.DataFrame( + { + "country": ["USA", "UK", "Germany"], + "region": ["North America", "Europe", "Europe"], + } + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError, match=error_pattern): + df1.ai.join(df2, instruction, gemini_flash_model) + + +def test_join_invalid_model_raise_error(): + cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) + countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + cities.ai.join(countries, "{city} is in {country}", None) + + 
+@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_search(session, text_embedding_generator, score_column): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.ai.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + score_column=score_column, + ).to_pandas() + + expected_result = pd.Series( + ["baboons", "chimpanzee"], index=[2, 4], name="creatures" + ) + pandas.testing.assert_series_equal( + actual_result["creatures"], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 1 + else: + assert score_column in actual_result.columns + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_search_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.ai.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + ) + + +def test_search_invalid_column_raises_error(session, text_embedding_generator): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.search("whatever", "monkey", top_k=2, 
model=text_embedding_generator) + + +def test_search_invalid_model_raises_error(session): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df.ai.search("creatures", "monkey", top_k=2, model=None) + + +def test_search_invalid_top_k_raises_error(session, text_embedding_generator): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.search("creatures", "monkey", top_k=0, model=text_embedding_generator) + + +@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_sim_join(session, text_embedding_generator, score_column): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + score_column=score_column, + ).to_pandas() + + expected_result = pd.DataFrame( + {"creatures": ["salmon", "cat"], "creatures_1": ["tuna", "dog"]} + ) + pandas.testing.assert_frame_equal( + actual_result[["creatures", "creatures_1"]], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 2 + else: + assert score_column in actual_result.columns + + +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", 
marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_sim_join_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + pytest.param("whatever", "creatures", id="incorrect_left_column"), + pytest.param("creatures", "whatever", id="incorrect_right_column"), + ], +) +def test_sim_join_invalid_column_raises_error( + session, text_embedding_generator, left_on, right_on +): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, left_on=left_on, right_on=right_on, model=text_embedding_generator + ) + + +def test_sim_join_invalid_model_raises_error(session): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): + df1.ai.sim_join(df2, left_on="creatures", right_on="creatures", model=None) + + +def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = 
dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + top_k=0, + model=text_embedding_generator, + ) + + +def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df1.ai.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + max_rows=1, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals} and {Animals}", + id="two_columns", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "{index}", + id="preserved", + marks=pytest.mark.xfail(raises=ValueError), + ), + ], +) +def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): + df = dataframe.DataFrame( + { + "Animals": ["Dog", "Cat", "Bird", "Horse"], + "ID": [1, 2, 3, 4], + "index": ["a", "b", "c", "d"], + } + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.ai.top_k(instruction, model=gemini_flash_model, k=2) + + +def test_top_k_invalid_k_raise_error(gemini_flash_model): + df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): + df.ai.top_k( 
+ "{Animals} are more popular as pets", + gemini_flash_model, + k=0, + ) + + +@patch("builtins.input", return_value="") +def test_confirm_operation__below_threshold_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 3, + ): + df.ai._confirm_operation(1) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + None, + ): + df.ai._confirm_operation(100) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 1, + "compute.semantic_ops_threshold_autofail", + True, + ), pytest.raises(exceptions.OperationAbortedError): + df.ai._confirm_operation(100) + + mock_input.assert_not_called() + + +@pytest.mark.parametrize( + ("reply", "expectation"), + [ + ("y", nullcontext()), + ("yes", nullcontext()), + ("", nullcontext()), + ("n", pytest.raises(exceptions.OperationAbortedError)), + ("something", pytest.raises(exceptions.OperationAbortedError)), + ], +) +def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypatch): + monkeypatch.setattr("builtins.input", lambda: reply) + df = dataframe.DataFrame({}) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 3, + ), expectation as e: + assert df.ai._confirm_operation(4) == e diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py new file mode 100644 index 0000000000..7d3c14314c --- /dev/null +++ b/tests/system/small/operations/test_ai.py @@ -0,0 +1,122 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Note that the tests in this file use fake models for deterministic results. +# Tests that use real LLM models are under system/large/operations/test_ai.py + +import pandas as pd +import pandas.testing +import pytest + +import bigframes +from bigframes import dataframe, dtypes +from bigframes.ml import llm + +SEM_OP_EXP_OPTION = "experiments.semantic_operators" +THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" + + +class FakeGeminiTextGenerator(llm.GeminiTextGenerator): + def __init__(self, prediction): + self.prediction = prediction + + def predict(self, *args, **kwargs): + return self.prediction + + +def test_experiment_off_raise_error(session): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session + ) + + with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( + NotImplementedError + ): + df.ai + + +def test_filter(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.filter( + "filter {col}", + model=model, + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col": ["A"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + +def test_map(session): + df = 
dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.map("map {col}", model=model, output_column="output").to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + {"col": ["A", "B"], "output": ["true", "false"]}, dtype=dtypes.STRING_DTYPE + ), + check_index_type=False, + ) + + +def test_join(session): + left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) + right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = left_df.ai.join( + right_df, "join {col_A} and {col_B}", model + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index a6ad5e3821..8d1e77510a 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -89,3 +89,14 @@ def test_dataframe_to_gbq_writes_to_anonymous_dataset( destination = dataframe.to_gbq() assert destination.startswith(anonymous_dataset_id) + + +def test_dataframe_semantics_property_future_warning( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with bigframes.option_context("experiments.semantic_operators", True), pytest.warns( + FutureWarning + ): + dataframe.semantics From df640c2c9ee843f32c38583e67f63c1eee515a92 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 19 Mar 2025 23:21:20 +0000 Subject: [PATCH 2/8] duplicate semantic op options too --- 
bigframes/_config/compute_options.py | 3 + bigframes/_config/experiment_options.py | 15 +++++ bigframes/operations/ai.py | 6 +- tests/system/large/operations/test_ai.py | 72 ++++++++++++------------ tests/system/small/operations/test_ai.py | 12 ++-- 5 files changed, 63 insertions(+), 45 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 21b41eb185..31ed4a2e4d 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -84,6 +84,9 @@ class ComputeOptions: semantic_ops_confirmation_threshold: Optional[int] = 0 semantic_ops_threshold_autofail = False + ai_ops_confirmation_threshold: Optional[int] = 0 + ai_ops_threshold_autofail = False + def assign_extra_query_labels(self, **kwargs: Any) -> None: """ Assigns additional custom labels for query configuration. The method updates the diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 3d52976004..6403f104a6 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -24,6 +24,7 @@ class ExperimentOptions: def __init__(self): self._semantic_operators: bool = False + self._ai_operators: bool = False self._blob: bool = False self._udf: bool = False @@ -41,6 +42,20 @@ def semantic_operators(self, value: bool): warnings.warn(msg, category=bfe.PreviewWarning) self._semantic_operators = value + @property + def ai_operators(self) -> bool: + return self._ai_operators + + @ai_operators.setter + def ai_operators(self, value: bool): + if value is True: + msg = bfe.format_message( + "AI operators are still under experiments, and are subject " + "to change in the future." 
+ ) + warnings.warn(msg, category=bfe.PreviewWarning) + self._ai_operators = value + @property def blob(self) -> bool: return self._blob diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index f54066f1e3..aef0d60b28 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -29,7 +29,7 @@ def __init__(self, df) -> None: import bigframes # Import in the function body to avoid circular imports. import bigframes.dataframe - if not bigframes.options.experiments.semantic_operators: + if not bigframes.options.experiments.ai_operators: raise NotImplementedError() self._df: bigframes.dataframe.DataFrame = df @@ -876,12 +876,12 @@ def _confirm_operation(row_count: int): """Raises OperationAbortedError when the confirmation fails""" import bigframes # Import in the function body to avoid circular imports. - threshold = bigframes.options.compute.semantic_ops_confirmation_threshold + threshold = bigframes.options.compute.ai_ops_confirmation_threshold if threshold is None or row_count <= threshold: return - if bigframes.options.compute.semantic_ops_threshold_autofail: + if bigframes.options.compute.ai_ops_threshold_autofail: raise exceptions.OperationAbortedError( f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows." 
) diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 01966533f8..2284be9238 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -22,9 +22,9 @@ import bigframes from bigframes import dataframe, exceptions, series -SEM_OP_EXP_OPTION = "experiments.semantic_operators" +AI_OP_EXP_OPTION = "experiments.ai_operators" BLOB_EXP_OPTION = "experiments.blob" -THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" +THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" def test_ai_experiment_off_raise_error(): @@ -32,7 +32,7 @@ def test_ai_experiment_off_raise_error(): {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( + with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( NotImplementedError ): df.ai @@ -49,7 +49,7 @@ def test_filter(session, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -68,7 +68,7 @@ def test_filter(session, gemini_flash_model): def test_filter_multi_model(session, gemini_flash_model): with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, BLOB_EXP_OPTION, True, @@ -110,7 +110,7 @@ def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatc monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -125,7 +125,7 @@ def test_filter_single_column_reference(session, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -164,7 +164,7 @@ def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model) df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + 
AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -178,7 +178,7 @@ def test_filter_invalid_model_raise_error(): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -197,7 +197,7 @@ def test_map(session, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -229,7 +229,7 @@ def test_map(session, gemini_flash_model): def test_map_multimodel(session, gemini_flash_model): with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, BLOB_EXP_OPTION, True, @@ -272,7 +272,7 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -314,7 +314,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -331,7 +331,7 @@ def test_map_invalid_model_raise_error(): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -370,7 +370,7 @@ def test_join(instruction, session, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -419,7 +419,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -440,7 +440,7 @@ def test_self_join(session, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -502,7 +502,7 @@ def test_join_invalid_instruction_raise_error( ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -515,7 
+515,7 @@ def test_join_invalid_model_raise_error(): countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -537,7 +537,7 @@ def test_search(session, text_embedding_generator, score_column): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -585,7 +585,7 @@ def test_search_with_confirmation( monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -605,7 +605,7 @@ def test_search_invalid_column_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -620,7 +620,7 @@ def test_search_invalid_model_raises_error(session): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -635,7 +635,7 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -661,7 +661,7 @@ def test_sim_join(session, text_embedding_generator, score_column): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -714,7 +714,7 @@ def test_sim_join_with_confirmation( monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -748,7 +748,7 @@ def test_sim_join_invalid_column_raises_error( ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -769,7 +769,7 @@ def test_sim_join_invalid_model_raises_error(session): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -788,7 +788,7 @@ def 
test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -813,7 +813,7 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -862,7 +862,7 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -874,7 +874,7 @@ def test_top_k_invalid_k_raise_error(gemini_flash_model): df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -891,7 +891,7 @@ def test_confirm_operation__below_threshold_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 3, @@ -906,7 +906,7 @@ def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, None, @@ -921,7 +921,7 @@ def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 1, @@ -948,7 +948,7 @@ def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypa df = dataframe.DataFrame({}) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 3, diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 7d3c14314c..368b4e5c7b 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -24,8 +24,8 @@ from bigframes import 
dataframe, dtypes from bigframes.ml import llm -SEM_OP_EXP_OPTION = "experiments.semantic_operators" -THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" +AI_OP_EXP_OPTION = "experiments.ai_operators" +THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" class FakeGeminiTextGenerator(llm.GeminiTextGenerator): @@ -41,7 +41,7 @@ def test_experiment_off_raise_error(session): {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session ) - with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( + with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( NotImplementedError ): df.ai @@ -56,7 +56,7 @@ def test_filter(session): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 50, @@ -82,7 +82,7 @@ def test_map(session): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 50, @@ -106,7 +106,7 @@ def test_join(session): ) with bigframes.option_context( - SEM_OP_EXP_OPTION, + AI_OP_EXP_OPTION, True, THRESHOLD_OPTION, 50, From 5db0d33c1f61503bdddb138ca7ec3e2786dcc2a7 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Mar 2025 02:10:48 +0000 Subject: [PATCH 3/8] improve test coverage --- bigframes/operations/ai.py | 4 ++- bigframes/operations/semantics.py | 4 ++- tests/system/small/operations/test_ai.py | 28 +++++++++++++++++++ .../system/small/operations/test_semantics.py | 28 +++++++++++++++++++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index aef0d60b28..c26117a360 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -616,7 +616,9 @@ def top_k( # - 1.0: Selected as part of the top-k items # - -1.0: Excluded from the top-k items status_column = guid.generate_guid("status") - df[status_column] = bigframes.series.Series(None, dtype=dtypes.FLOAT_DTYPE) + df[status_column] = bigframes.series.Series( + None, 
dtype=dtypes.FLOAT_DTYPE, session=df._session + ) num_selected = 0 while num_selected < k: diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index f4b9d85103..a1cc38d9cb 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -896,7 +896,9 @@ def top_k( # - 1.0: Selected as part of the top-k items # - -1.0: Excluded from the top-k items status_column = guid.generate_guid("status") - df[status_column] = bigframes.series.Series(None, dtype=dtypes.FLOAT_DTYPE) + df[status_column] = bigframes.series.Series( + None, dtype=dtypes.FLOAT_DTYPE, session=df._session + ) num_selected = 0 while num_selected < k: diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 368b4e5c7b..65a4f7d553 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -120,3 +120,31 @@ def test_join(session): pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), check_index_type=False, ) + + +def test_top_k(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["Document 1"]}, session=session + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + { + "col": ["A"], + }, + dtype=dtypes.STRING_DTYPE, + ), + check_index_type=False, + ) diff --git a/tests/system/small/operations/test_semantics.py b/tests/system/small/operations/test_semantics.py index 85777faaf6..7ce4fdd4ad 100644 --- a/tests/system/small/operations/test_semantics.py +++ b/tests/system/small/operations/test_semantics.py @@ -122,3 +122,31 @@ def test_join(session): pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), check_index_type=False, ) + + +def 
test_top_k(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["Document 1"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.semantics.top_k("top k of {col}", model, k=1).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + { + "col": ["A"], + }, + dtype=dtypes.STRING_DTYPE, + ), + check_index_type=False, + ) From 238d09868dd05cd4ab3a73b9d6d0b2107f48f888 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Mar 2025 02:46:11 +0000 Subject: [PATCH 4/8] relax test condition --- tests/system/small/operations/test_ai.py | 11 +---------- tests/system/small/operations/test_semantics.py | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 65a4f7d553..6827ddba36 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -138,13 +138,4 @@ def test_top_k(session): ): result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() - pandas.testing.assert_frame_equal( - result, - pd.DataFrame( - { - "col": ["A"], - }, - dtype=dtypes.STRING_DTYPE, - ), - check_index_type=False, - ) + assert len(result) == 1 diff --git a/tests/system/small/operations/test_semantics.py b/tests/system/small/operations/test_semantics.py index 7ce4fdd4ad..8b520d8c03 100644 --- a/tests/system/small/operations/test_semantics.py +++ b/tests/system/small/operations/test_semantics.py @@ -140,13 +140,4 @@ def test_top_k(session): ): result = df.semantics.top_k("top k of {col}", model, k=1).to_pandas() - pandas.testing.assert_frame_equal( - result, - pd.DataFrame( - { - "col": ["A"], - }, - dtype=dtypes.STRING_DTYPE, - ), - check_index_type=False, - ) + assert len(result) == 1 From 
bcb5bcfcfba57d964f804606c44baae7916d5816 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Mar 2025 21:29:09 +0000 Subject: [PATCH 5/8] clean up semantics and add public docs --- bigframes/_config/compute_options.py | 14 ++++++- bigframes/_config/experiment_options.py | 5 +-- bigframes/dataframe.py | 5 ++- bigframes/operations/ai.py | 40 +++++++++---------- docs/reference/bigframes.pandas/frame.rst | 8 ++++ docs/templates/toc.yml | 2 + tests/system/large/operations/test_ai.py | 2 +- tests/system/small/operations/test_ai.py | 2 +- tests/unit/_config/test_experiment_options.py | 17 +++++++- 9 files changed, 64 insertions(+), 31 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 31ed4a2e4d..45ab9a2391 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -66,12 +66,22 @@ class ComputeOptions: engine to handle. However this comes at the cost of increase cost and latency. extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. - semmantic_ops_confirmation_threshold (int, optional): - Guards against unexepcted processing of large amount of rows by semantic operators. + semantic_ops_confirmation_threshold (int, optional): + Semantics operators are deprecated. Please use AI operators instead. + [Deprecated] Guards against unexepcted processing of large amount of rows by semantic operators. If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None to turn off the guard. semantic_ops_threshold_autofail (bool): + Semantics operators are deprecated. Please use AI operators instead. + [Deprecated] Guards against unexepcted processing of large amount of rows by semantic operators. + When set to True, the operation automatically fails without asking for user inputs. 
+ ai_ops_confirmation_threshold (int, optional): + Guards against unexepcted processing of large amount of rows by semantic operators. + If the number of rows exceeds the threshold, the user will be asked to confirm + their operations to resume. The default value is 0. Set the value to None + to turn off the guard. + ai_ops_threshold_autofail (bool): Guards against unexepcted processing of large amount of rows by semantic operators. When set to True, the operation automatically fails without asking for user inputs. """ diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 6403f104a6..abe465de50 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -36,10 +36,9 @@ def semantic_operators(self) -> bool: def semantic_operators(self, value: bool): if value is True: msg = bfe.format_message( - "Semantic operators are still under experiments, and are subject " - "to change in the future." + "Semantic operators are deprecated, and will be removed in the future" ) - warnings.warn(msg, category=bfe.PreviewWarning) + warnings.warn(msg, category=FutureWarning) self._semantic_operators = value @property diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fdcee4505b..7659c727d1 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4576,11 +4576,12 @@ def _throw_if_null_index(self, opname: str): @property def semantics(self): warnings.warn( - "The 'semantic' property will be removed. Please use 'ai' instead.", + "The 'semantics' property will be removed. 
Please use 'ai' instead.", FutureWarning, ) return bigframes.operations.semantics.Semantics(self) @property def ai(self): - return bigframes.operations.ai.Ai(self) + """Returns the accessor for AI operators.""" + return bigframes.operations.ai.AI(self) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index c26117a360..6c98339788 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -24,7 +24,7 @@ @log_adapter.class_logger -class Ai: +class AI: def __init__(self, df) -> None: import bigframes # Import in the function body to avoid circular imports. import bigframes.dataframe @@ -42,8 +42,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -78,7 +78,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals bigframes.pandas.DataFrame: DataFrame filtered by the instruction. Raises: - NotImplementedError: when the semantic operator experiment is off. + NotImplementedError: when the AI operator experiment is off. ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. 
""" @@ -156,8 +156,8 @@ def map( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -198,7 +198,7 @@ def map( bigframes.pandas.DataFrame: DataFrame with attached mapping results. Raises: - NotImplementedError: when the semantic operator experiment is off. + NotImplementedError: when the AI operator experiment is off. ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ @@ -279,8 +279,8 @@ def join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -412,7 +412,7 @@ def search( score_column: Optional[str] = None, ): """ - Performs semantic search on the DataFrame. + Performs AI semantic search on the DataFrame. 
** Examples: ** @@ -420,8 +420,8 @@ def search( >>> bpd.options.display.progress_bar = None >>> import bigframes - >>> bigframes.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bigframes.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") @@ -521,8 +521,8 @@ def top_k( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -560,7 +560,7 @@ def top_k( bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. Raises: - NotImplementedError: when the semantic operator experiment is off. + NotImplementedError: when the AI operator experiment is off. ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ @@ -573,9 +573,7 @@ def top_k( if column not in self._df.columns: raise ValueError(f"Column {column} not found.") if len(columns) > 1: - raise NotImplementedError( - "Semantic aggregations are limited to a single column." 
- ) + raise NotImplementedError("AI top_k are limited to a single column.") if ground_with_google_search: msg = exceptions.format_message( @@ -726,8 +724,8 @@ def sim_join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True - >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") @@ -892,7 +890,7 @@ def _confirm_operation(row_count: int): # input function makes it less visible to the end user. print(f"This operation will process about {row_count} rows.") print( - "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." + "You can raise the confirmation threshold by setting `bigframes.options.compute.ai_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." ) print("Proceed? [Y/n]") reply = input().casefold() diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index bc9f714416..7ab86ddbd3 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -34,3 +34,11 @@ Struct handling :members: :inherited-members: :undoc-members: + +AI operators +^^^^^^^^^^^^ + +.. 
autoclass:: bigframes.operations.ai.AI + :members: + :inherited-members: + :undoc-members: \ No newline at end of file diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b4f513b11d..08420362a1 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -45,6 +45,8 @@ uid: bigframes.operations.plotting.PlotAccessor - name: StructAccessor uid: bigframes.operations.structs.StructFrameAccessor + - name: AI + uid: bigframes.operations.ai.AI - items: - name: DataFrameGroupBy uid: bigframes.core.groupby.DataFrameGroupBy diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 2284be9238..04074a2ea6 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -925,7 +925,7 @@ def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): True, THRESHOLD_OPTION, 1, - "compute.semantic_ops_threshold_autofail", + "compute.ai_ops_threshold_autofail", True, ), pytest.raises(exceptions.OperationAbortedError): df.ai._confirm_operation(100) diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 6827ddba36..de6ba4b86c 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -14,7 +14,7 @@ # Note that the tests in this files uses fake models for deterministic results. 
-# Tests that use real LLM models are under system/large/test_semantcs.py +# Tests that use real LLM models are under system/large/test_ai.py import pandas as pd import pandas.testing diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index 9735e494be..ce1dd0f146 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -27,12 +27,27 @@ def test_semantic_operators_default_false(): def test_semantic_operators_set_true_shows_warning(): options = experiment_options.ExperimentOptions() - with pytest.warns(bfe.PreviewWarning): + with pytest.warns(FutureWarning): options.semantic_operators = True assert options.semantic_operators is True +def test_ai_operators_default_false(): + options = experiment_options.ExperimentOptions() + + assert options.ai_operators is False + + +def test_ai_operators_set_true_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(bfe.PreviewWarning): + options.ai_operators = True + + assert options.ai_operators is True + + def test_blob_default_false(): options = experiment_options.ExperimentOptions() From 20b4cfe3ada51418a744eef707ac0a1b7ae82e4a Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Mar 2025 23:13:10 +0000 Subject: [PATCH 6/8] addressing comments --- bigframes/_config/compute_options.py | 22 ++++++++++++---------- bigframes/dataframe.py | 8 ++++---- bigframes/operations/ai.py | 2 +- docs/reference/bigframes.pandas/frame.rst | 2 +- docs/templates/toc.yml | 2 +- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 45ab9a2391..eb287f6065 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -60,29 +60,31 @@ class ComputeOptions: bytes billed beyond this limit will fail (without incurring a charge). 
If unspecified, this will be set to your project default. See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. + enable_multi_query_execution (bool, Options): If enabled, large queries may be factored into multiple smaller queries in order to avoid generating queries that are too complex for the query engine to handle. However this comes at the cost of increase cost and latency. + extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. + semantic_ops_confirmation_threshold (int, optional): - Semantics operators are deprecated. Please use AI operators instead. - [Deprecated] Guards against unexepcted processing of large amount of rows by semantic operators. - If the number of rows exceeds the threshold, the user will be asked to confirm - their operations to resume. The default value is 0. Set the value to None - to turn off the guard. + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead + semantic_ops_threshold_autofail (bool): - Semantics operators are deprecated. Please use AI operators instead. - [Deprecated] Guards against unexepcted processing of large amount of rows by semantic operators. - When set to True, the operation automatically fails without asking for user inputs. + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead + ai_ops_confirmation_threshold (int, optional): - Guards against unexepcted processing of large amount of rows by semantic operators. + Guards against unexpected processing of a large number of rows by AI operators. If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None to turn off the guard.
+ ai_ops_threshold_autofail (bool): - Guards against unexepcted processing of large amount of rows by semantic operators. + Guards against unexpected processing of a large number of rows by AI operators. When set to True, the operation automatically fails without asking for user inputs. """ diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7659c727d1..b960723188 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4575,13 +4575,13 @@ def _throw_if_null_index(self, opname: str): @property def semantics(self): - warnings.warn( - "The 'semantics' property will be removed. Please use 'ai' instead.", - FutureWarning, + msg = bfe.format_message( + "The 'semantics' property will be removed. Please use 'ai' instead." ) + warnings.warn(msg, category=bfe.PreviewWarning) return bigframes.operations.semantics.Semantics(self) @property def ai(self): """Returns the accessor for AI operators.""" - return bigframes.operations.ai.AI(self) + return bigframes.operations.ai.AIAccessor(self) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 6c98339788..414d5aee41 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -24,7 +24,7 @@ @log_adapter.class_logger -class AI: +class AIAccessor: def __init__(self, df) -> None: import bigframes # Import in the function body to avoid circular imports. import bigframes.dataframe diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index 7ab86ddbd3..4e231bd821 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -38,7 +38,7 @@ Struct handling AI operators ^^^^^^^^^^^^ -.. autoclass:: bigframes.operations.ai.AI +..
autoclass:: bigframes.operations.ai.AIAccessor :members: :inherited-members: :undoc-members: \ No newline at end of file diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 08420362a1..b00044b087 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -46,7 +46,7 @@ - name: StructAccessor uid: bigframes.operations.structs.StructFrameAccessor - name: AI - uid: bigframes.operations.ai.AI + uid: bigframes.operations.ai.AIAccessor - items: - name: DataFrameGroupBy uid: bigframes.core.groupby.DataFrameGroupBy From fd37607d785cac15a76bfeb5e5838b644860366f Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Mar 2025 23:18:16 +0000 Subject: [PATCH 7/8] use FutureWarning for deprecation --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b960723188..3d3a364b4c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4578,7 +4578,7 @@ def semantics(self): msg = bfe.format_message( "The 'semantics' property will be removed. Please use 'ai' instead." 
) - warnings.warn(msg, category=bfe.PreviewWarning) + warnings.warn(msg, category=FutureWarning) return bigframes.operations.semantics.Semantics(self) @property From 6ae8dfeae30bffff7d1b5b942fa6f79929d665a7 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 25 Mar 2025 18:42:23 +0000 Subject: [PATCH 8/8] copy over recent top_k changes --- bigframes/operations/ai.py | 26 ++++++++++++-------------- bigframes/operations/semantics.py | 4 +--- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 414d5aee41..0ff92187cf 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -527,13 +527,17 @@ def top_k( >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") - >>> df = bpd.DataFrame({"Animals": ["Dog", "Bird", "Cat", "Horse"]}) + >>> df = bpd.DataFrame( + ... { + ... "Animals": ["Dog", "Bird", "Cat", "Horse"], + ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], + ... 
}) >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) - Animals - 0 Dog - 2 Cat + Animals Sounds + 0 Dog Woof + 2 Cat Meow - [2 rows x 1 columns] + [2 rows x 2 columns] Args: instruction (str): @@ -573,7 +577,7 @@ def top_k( if column not in self._df.columns: raise ValueError(f"Column {column} not found.") if len(columns) > 1: - raise NotImplementedError("AI top_k are limited to a single column.") + raise NotImplementedError("AI top K are limited to a single column.") if ground_with_google_search: msg = exceptions.format_message( @@ -631,14 +635,8 @@ def top_k( ) num_selected += num_new_selected - df = ( - df[df[status_column] > 0] - .drop(["index", status_column], axis=1) - .rename(columns={"old_index": "index"}) - .set_index("index") - ) - df.index.name = None - return df + result_df: bigframes.dataframe.DataFrame = self._df.copy() + return result_df[df.set_index("old_index")[status_column] > 0.0] @staticmethod def _topk_partition( diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 2930fe178e..d1089f993e 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -857,9 +857,7 @@ def top_k( if column not in self._df.columns: raise ValueError(f"Column {column} not found.") if len(columns) > 1: - raise NotImplementedError( - "Semantic aggregations are limited to a single column." - ) + raise NotImplementedError("Semantic top K are limited to a single column.") if ground_with_google_search: msg = exceptions.format_message(