From 1053d56260eef1cff6e7c419f6c86be8f7e74373 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Thu, 15 Aug 2024 18:05:21 -0700
Subject: [PATCH 1/7] docs: Remove duplicate description for `kms_key_name`
 (#898)

---
 bigframes/_config/bigquery_options.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py
index 34b9a3128f..502f103bb5 100644
--- a/bigframes/_config/bigquery_options.py
+++ b/bigframes/_config/bigquery_options.py
@@ -224,13 +224,9 @@ def use_regional_endpoints(self, value: bool):
 
     @property
     def kms_key_name(self) -> Optional[str]:
         """
-        Customer-managed encryption key
-        used to control encryption of the data at rest in BigQuery. This key
-        takes the format projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY
-
         Customer managed encryption key used to control encryption of the
         data-at-rest in BigQuery. This is of the format
-        projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY
+        projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY.
         For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption
         Customer-managed Cloud KMS keys

From 92fdb937220146f770c4ce32f8317eb55ece434d Mon Sep 17 00:00:00 2001
From: TrevorBergeron
Date: Fri, 16 Aug 2024 10:01:31 -0700
Subject: [PATCH 2/7] refactor: Extract DataFrame execution to class (#899)

---
 bigframes/core/blocks.py                    |  33 +-
 bigframes/core/compile/api.py               |  12 +-
 bigframes/core/compile/compiled.py          |  34 +-
 bigframes/dataframe.py                      |  51 ++-
 bigframes/series.py                         |   9 -
 bigframes/session/__init__.py               | 417 +++++----------------
 bigframes/session/_io/bigquery/__init__.py  | 110 +++---
 bigframes/session/clients.py                |  15 +
 bigframes/session/executor.py               | 346 +++++++++++++++++
 bigframes/session/metrics.py                |  88 +++++
 bigframes/session/temp_storage.py           |  96 +++++
 tests/system/large/test_session.py          |  18 +-
 tests/system/small/test_dataframe.py        |   4 +-
 tests/system/small/test_encryption.py       |   6 +-
 tests/system/small/test_session.py          |   3 +-
 tests/unit/session/test_io_bigquery.py      |   7 +-
 16 files changed, 769 insertions(+), 480 deletions(-)
 create mode 100644 bigframes/session/executor.py
 create mode 100644 bigframes/session/metrics.py
 create mode 100644 bigframes/session/temp_storage.py

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 65a89b4516..9361543d5f 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -488,12 +488,7 @@ def to_arrow(
             list(self.value_columns) + list(self.index_columns)
         )
 
-        _, query_job = self.session._query_to_destination(
-            self.session._to_sql(expr, ordered=ordered),
-            list(self.index_columns),
-            api_name="cached",
-            do_clustering=False,
-        )
+        _, query_job = self.session._execute(expr, ordered=ordered)
         results_iterator = query_job.result()
         pa_table = results_iterator.to_arrow()
 
@@ -582,12 +577,7 @@ def to_pandas_batches(
             see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result"""
         dtypes = dict(zip(self.index_columns, self.index.dtypes))
         dtypes.update(zip(self.value_columns, self.dtypes))
-        _, query_job = self.session._query_to_destination(
-            self.session._to_sql(self.expr, ordered=True),
-            list(self.index_columns),
-            api_name="cached",
-            do_clustering=False,
-        )
+        _, query_job = self.session._execute(self.expr, ordered=True)
         results_iterator = query_job.result(
             page_size=page_size, max_results=max_results
         )
@@ -617,11 +607,8 @@ def _materialize_local(
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
-        _, query_job = self.session._query_to_destination(
-            self.session._to_sql(self.expr, ordered=materialize_options.ordered),
-            list(self.index_columns),
-            api_name="cached",
-            do_clustering=False,
+        _, query_job = self.session._execute(
+            self.expr, ordered=materialize_options.ordered
         )
         results_iterator = query_job.result()
 
@@ -797,8 +784,7 @@ def _compute_dry_run(
         self, value_keys: Optional[Iterable[str]] = None
     ) -> bigquery.QueryJob:
         expr = self._apply_value_keys_to_expr(value_keys=value_keys)
-        job_config = bigquery.QueryJobConfig(dry_run=True)
-        _, query_job = self.session._execute(expr, job_config=job_config, dry_run=True)
+        _, query_job = self.session._dry_run(expr)
         return query_job
 
     def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None):
@@ -2404,12 +2390,15 @@ def to_sql_query(
 
     def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         """Write the block to a session table."""
         # use a heuristic for whether something needs to be cached
-        if (not force) and self.session._is_trivially_executable(self.expr):
+        if (not force) and self.session._executor._is_trivially_executable(self.expr):
             return
         elif session_aware:
-            self.session._cache_with_session_awareness(self.expr)
+            bfet_roots = [obj._block._expr.node for obj in self.session.objects]
+            self.session._executor._cache_with_session_awareness(
+                self.expr, session_forest=bfet_roots
+            )
         else:
-            self.session._cache_with_cluster_cols(
+            self.session._executor._cache_with_cluster_cols(
                 self.expr, cluster_cols=self.index_columns
             )

diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py
index 468c5522d9..4e833411ae 100644
--- a/bigframes/core/compile/api.py
+++ b/bigframes/core/compile/api.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 from __future__ import annotations
 
-from typing import Mapping, Tuple, TYPE_CHECKING
+from typing import Mapping, Sequence, Tuple, TYPE_CHECKING
+
+import google.cloud.bigquery as bigquery
 
 import bigframes.core.compile.compiler as compiler
 
@@ -58,11 +60,13 @@ def compile_ordered(
     def compile_raw(
         self,
         node: bigframes.core.nodes.BigFrameNode,
-    ) -> Tuple[str, bigframes.core.ordering.RowOrdering]:
+    ) -> Tuple[
+        str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering
+    ]:
         """Compile node into sql that exposes all columns, including hidden ordering-only columns."""
         ir = self._compiler.compile_ordered_ir(node)
-        sql = ir.raw_sql()
-        return sql, ir._ordering
+        sql, schema = ir.raw_sql_and_schema()
+        return sql, schema, ir._ordering
 
 
 def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode):

diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index 5492502f21..512238440c 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -20,11 +20,14 @@
 from typing import Collection, Literal, Optional, Sequence
 
 import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
+import google.cloud.bigquery
 import ibis
 import ibis.backends.bigquery as ibis_bigquery
+import ibis.backends.bigquery.datatypes
 import ibis.common.deferred  # type: ignore
 import ibis.expr.datatypes as ibis_dtypes
 import ibis.expr.operations as ibis_ops
+import ibis.expr.schema as ibis_schema
 import ibis.expr.types as ibis_types
 import pandas
 
@@ -531,7 +534,8 @@ def __init__(
             for column in self._columns
         }
         self._hidden_ordering_column_names = {
-            column.get_name(): column for column in self._hidden_ordering_columns
+            typing.cast(str, column.get_name()): column
+            for column in self._hidden_ordering_columns
         }
         ### Validation
         value_col_ids = self._column_names.keys()
@@ -947,14 +951,28 @@ def to_sql(
         )
         return typing.cast(str, sql)
 
-    def raw_sql(self) -> str:
-        """Return sql with all hidden columns. Used to cache with ordering information."""
-        return ibis_bigquery.Backend().compile(
-            self._to_ibis_expr(
-                ordering_mode="unordered",
-                expose_hidden_cols=True,
-            )
+    def raw_sql_and_schema(
+        self,
+    ) -> typing.Tuple[str, typing.Sequence[google.cloud.bigquery.SchemaField]]:
+        """Return sql with all hidden columns. Used to cache with ordering information.
+
+        Also returns schema, as the extra ordering columns are determined compile-time.
+        """
+        all_columns = (*self.column_ids, *self._hidden_ordering_column_names.keys())
+        as_ibis = self._to_ibis_expr(
+            ordering_mode="unordered",
+            expose_hidden_cols=True,
+        ).select(all_columns)
+
+        # Ibis will produce non-nullable schema types, but bigframes should always be nullable
+        fixed_ibis_schema = ibis_schema.Schema.from_tuples(
+            (name, dtype.copy(nullable=True))
+            for (name, dtype) in as_ibis.schema().items()
+        )
+        bq_schema = ibis.backends.bigquery.datatypes.BigQuerySchema.from_ibis(
+            fixed_ibis_schema
         )
+        return ibis_bigquery.Backend().compile(as_ibis), bq_schema
 
     def _to_ibis_expr(
         self,

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 649b097e92..dabe85c923 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1214,7 +1214,6 @@ def to_arrow(
             category=bigframes.exceptions.PreviewWarning,
         )
 
-        self._optimize_query_complexity()
         pa_table, query_job = self._block.to_arrow(ordered=ordered)
         self._set_internal_query_job(query_job)
         return pa_table
@@ -1255,7 +1254,6 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame.
""" # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job - self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1285,7 +1283,6 @@ def to_pandas_batches( form the original dataframe. Results stream from bigquery, see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable """ - self._optimize_query_complexity() return self._block.to_pandas_batches( page_size=page_size, max_results=max_results ) @@ -3046,12 +3043,6 @@ def to_gbq( ordering_id: Optional[str] = None, clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]] = (), ) -> str: - dispositions = { - "fail": bigquery.WriteDisposition.WRITE_EMPTY, - "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, - "append": bigquery.WriteDisposition.WRITE_APPEND, - } - temp_table_ref = None if destination_table is None: @@ -3063,7 +3054,7 @@ def to_gbq( ) if_exists = "replace" - temp_table_ref = self._session._random_table( + temp_table_ref = self._session._temp_storage_manager._random_table( # The client code owns this table reference now, so skip_cleanup=True # to not clean it up when we close the session. skip_cleanup=True, @@ -3086,10 +3077,11 @@ def to_gbq( if if_exists is None: if_exists = "fail" - if if_exists not in dispositions: + valid_if_exists = ["fail", "replace", "append"] + if if_exists not in valid_if_exists: raise ValueError( f"Got invalid value {repr(if_exists)} for if_exists. " - f"Valid options include None or one of {dispositions.keys()}." + f"Valid options include None or one of {valid_if_exists}." ) try: @@ -3101,16 +3093,25 @@ def to_gbq( clustering_columns, index=index ) - job_config = bigquery.QueryJobConfig( - write_disposition=dispositions[if_exists], - destination=bigquery.table.TableReference.from_string( - destination_table, - default_project=default_project, - ), - clustering_fields=clustering_fields if clustering_fields else None, + export_array, id_overrides = self._prepare_export( + index=index and self._has_index, ordering_id=ordering_id + ) + destination = bigquery.table.TableReference.from_string( + destination_table, + default_project=default_project, + ) + _, query_job = self._session._export( + export_array, + destination=destination, + col_id_overrides=id_overrides, + cluster_cols=clustering_fields, + if_exists=if_exists, ) + self._set_internal_query_job(query_job) - self._run_io_query(index=index, ordering_id=ordering_id, job_config=job_config) + # The query job should have finished, so there should be always be a result table. 
+ result_table = query_job.destination + assert result_table is not None if temp_table_ref: bigframes.session._io.bigquery.set_table_expiration( @@ -3402,19 +3403,16 @@ def _run_io_query( self, index: bool, ordering_id: Optional[str] = None, - job_config: Optional[bigquery.job.QueryJobConfig] = None, ) -> bigquery.TableReference: """Executes a query job presenting this dataframe and returns the destination table.""" session = self._block.expr.session - self._optimize_query_complexity() export_array, id_overrides = self._prepare_export( index=index and self._has_index, ordering_id=ordering_id ) _, query_job = session._execute( export_array, - job_config=job_config, ordered=False, col_id_overrides=id_overrides, ) @@ -3669,13 +3667,6 @@ def _cached(self, *, force: bool = False) -> DataFrame: self._block.cached(force=force) return self - def _optimize_query_complexity(self): - """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. - May generate many queries and take substantial time to execute. - """ - # TODO: Move all this to session - self._session._simplify_with_caching(self._block.expr) - _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") @validations.requires_ordering() diff --git a/bigframes/series.py b/bigframes/series.py index 069c469a85..7ba4858b5e 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -188,7 +188,6 @@ def __len__(self): __len__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__len__) def __iter__(self) -> typing.Iterator: - self._optimize_query_complexity() return itertools.chain.from_iterable( map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches()) ) @@ -358,7 +357,6 @@ def to_pandas( pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. """ - self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1892,13 +1890,6 @@ def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series: self._block.cached(force=force, session_aware=session_aware) return self - def _optimize_query_complexity(self): - """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. - May generate many queries and take substantial time to execute. 
- """ - # TODO: Move all this to session - self._block.session._simplify_with_caching(self._block.expr) - def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8cef869a32..0868ef202a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -20,7 +20,6 @@ import datetime import itertools import logging -import math import os import secrets import typing @@ -40,7 +39,6 @@ Tuple, Union, ) -import uuid import warnings import weakref @@ -81,12 +79,8 @@ import bigframes.core.blocks as blocks import bigframes.core.compile import bigframes.core.guid -import bigframes.core.nodes as nodes -import bigframes.core.ordering as order import bigframes.core.pruning import bigframes.core.schema as schemata -import bigframes.core.tree_properties as traversals -import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils # Even though the ibis.backends.bigquery import is unused, it's needed @@ -100,7 +94,10 @@ import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients +import bigframes.session.executor +import bigframes.session.metrics import bigframes.session.planner +import bigframes.session.temp_storage import bigframes.version # Avoid circular imports. @@ -112,8 +109,6 @@ _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" -_TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" - _MAX_CLUSTER_COLUMNS = 4 # TODO(swast): Need to connect to regional endpoints when performing remote @@ -133,11 +128,6 @@ # Also must assume that text encoding as literals is much less efficient than in-memory representation. MAX_INLINE_DF_BYTES = 5000 -# Max complexity that should be executed as a single query -QUERY_COMPLEXITY_LIMIT = 1e7 -# Number of times to factor out subqueries before giving up. -MAX_SUBTREE_FACTORINGS = 5 - logger = logging.getLogger(__name__) # Excludes geography, bytes, and nested (array, struct) datatypes @@ -277,7 +267,6 @@ def __init__( # only needs to be unique among sessions created by the same user # at the same time in the same region self._session_id: str = "session" + secrets.token_hex(3) - self._table_ids: List[str] = [] # store table ids and delete them when the session is closed self._objects: list[ @@ -289,14 +278,6 @@ def __init__( ] ] ] = [] - self._cached_executions: weakref.WeakKeyDictionary[ - nodes.BigFrameNode, nodes.BigFrameNode - ] = weakref.WeakKeyDictionary() - - # performance logging - self._bytes_processed_sum = 0 - self._slot_millis_sum = 0 - self._execution_count = 0 # Whether this session treats objects as totally ordered. 
# Will expose as feature later, only False for internal testing self._strictly_ordered: bool = context.ordering_mode != "partial" @@ -312,12 +293,24 @@ def __init__( if self._strictly_ordered else bigframes.enums.DefaultIndexKind.NULL ) - self._compiler = bigframes.core.compile.SQLCompiler( - strict=self._strictly_ordered - ) self._allow_ambiguity = not self._strictly_ordered + self._metrics = bigframes.session.metrics.ExecutionMetrics() self._remote_function_session = bigframes_rf_session.RemoteFunctionSession() + self._temp_storage_manager = ( + bigframes.session.temp_storage.TemporaryGbqStorageManager( + self._clients_provider.bqclient, + dataset=self._anonymous_dataset, + session_id=self._session_id, + kms_key=self._bq_kms_key_name, + ) + ) + self._executor = bigframes.session.executor.BigQueryCachingExecutor( + bqclient=self._clients_provider.bqclient, + storage_manager=self._temp_storage_manager, + strictly_ordered=self._strictly_ordered, + metrics=self._metrics, + ) @property def bqclient(self): @@ -373,48 +366,26 @@ def _project(self): @property def bytes_processed_sum(self): """The sum of all bytes processed by bigquery jobs using this session.""" - return self._bytes_processed_sum + return self._metrics.bytes_processed @property def slot_millis_sum(self): """The sum of all slot time used by bigquery jobs in this session.""" - return self._slot_millis_sum + return self._metrics.slot_millis @property def _allows_ambiguity(self) -> bool: return self._allow_ambiguity - def _add_bytes_processed(self, amount: int): - """Increment bytes_processed_sum by amount.""" - self._bytes_processed_sum += amount - - def _add_slot_millis(self, amount: int): - """Increment slot_millis_sum by amount.""" - self._slot_millis_sum += amount - - def _add_execution(self, amount: int = 1): - """Increment slot_millis_sum by amount.""" - self._execution_count += amount - def __hash__(self): # Stable hash needed to use in expression tree return hash(str(self._anonymous_dataset)) - def _clean_up_tables(self): - """Delete tables that were created with this session's session_id.""" - client = self.bqclient - project_id = self._anonymous_dataset.project - dataset_id = self._anonymous_dataset.dataset_id - - for table_id in self._table_ids: - full_id = ".".join([project_id, dataset_id, table_id]) - client.delete_table(full_id, not_found_ok=True) - def close(self): """Delete resources that were created with this session's session_id. This includes BigQuery tables, remote functions and cloud functions serving the remote functions.""" - self._clean_up_tables() + self._temp_storage_manager.clean_up_tables() self._remote_function_session.clean_up( self.bqclient, self.cloudfunctionsclient, self.session_id ) @@ -485,7 +456,6 @@ def _query_to_destination( configuration: dict = {"query": {"useQueryCache": True}}, do_clustering=True, ) -> Tuple[Optional[bigquery.TableReference], bigquery.QueryJob]: - self._add_execution(1) # If a dry_run indicates this is not a query type job, then don't # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. dry_run_config = bigquery.QueryJobConfig() @@ -500,16 +470,15 @@ def _query_to_destination( # Create a table to workaround BigQuery 10 GB query results limit. See: # internal issue 303057336. # Since we have a `statement_type == 'SELECT'`, schema should be populated. 
- schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema) + schema = dry_run_job.schema + assert schema is not None if do_clustering: - cluster_cols = [ - item.name - for item in schema - if (item.name in index_cols) and _can_cluster_bq(item) - ][:_MAX_CLUSTER_COLUMNS] + cluster_cols = bf_io_bigquery.select_cluster_cols( + schema, cluster_candidates=index_cols + ) else: cluster_cols = [] - temp_table = self._create_empty_temp_table(schema, cluster_cols) + temp_table = self._temp_storage_manager.create_temp_table(schema, cluster_cols) timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get( "timeoutMs" @@ -1211,8 +1180,7 @@ def _read_pandas_load_job( pandas_dataframe_copy.columns = pandas.Index(new_col_ids) pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) - job_config = self._prepare_load_job_config() - + job_config = bigquery.LoadJobConfig() # Specify the datetime dtypes, which is auto-detected as timestamp types. schema: list[bigquery.SchemaField] = [] for column, dtype in zip(new_col_ids, pandas_dataframe.dtypes): @@ -1228,7 +1196,7 @@ def _read_pandas_load_job( job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._random_table() + load_table_destination = self._temp_storage_manager._random_table() load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1289,7 +1257,7 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ) -> dataframe.DataFrame: - table = self._random_table() + table = self._temp_storage_manager._random_table() if engine is not None and engine == "bigquery": if any(param is not None for param in (dtype, names)): @@ -1349,7 +1317,7 @@ def read_csv( f"{constants.FEEDBACK_LINK}" ) - job_config = self._prepare_load_job_config() + job_config = bigquery.LoadJobConfig() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.CSV job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1426,10 +1394,10 @@ def read_parquet( *, engine: str = "auto", ) -> dataframe.DataFrame: - table = self._random_table() + table = self._temp_storage_manager._random_table() if engine == "bigquery": - job_config = self._prepare_load_job_config() + job_config = bigquery.LoadJobConfig() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.PARQUET job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1463,7 +1431,7 @@ def read_json( engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", **kwargs, ) -> dataframe.DataFrame: - table = self._random_table() + table = self._temp_storage_manager._random_table() if engine == "bigquery": @@ -1487,7 +1455,7 @@ def read_json( "'lines' keyword is only valid when 'orient' is 'records'." ) - job_config = self._prepare_load_job_config() + job_config = bigquery.LoadJobConfig() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1553,25 +1521,6 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." 
) - def _create_empty_temp_table( - self, - schema: Iterable[bigquery.SchemaField], - cluster_cols: List[str], - ) -> bigquery.TableReference: - # Can't set a table in _SESSION as destination via query job API, so we - # run DDL, instead. - expiration = ( - datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION - ) - - table = bf_io_bigquery.create_temp_table( - self, - expiration, - schema=schema, - cluster_columns=cluster_cols, - ) - return bigquery.TableReference.from_string(table) - def _sql_to_temp_table( self, sql: str, @@ -1851,42 +1800,6 @@ def read_gbq_function( session=self, ) - def _prepare_query_job_config( - self, - job_config: Optional[bigquery.QueryJobConfig] = None, - ) -> bigquery.QueryJobConfig: - if job_config is None: - job_config = bigquery.QueryJobConfig() - else: - # Create a copy so that we don't mutate the original config passed - job_config = typing.cast( - bigquery.QueryJobConfig, - bigquery.QueryJobConfig.from_api_repr(job_config.to_api_repr()), - ) - - if bigframes.options.compute.maximum_bytes_billed is not None: - job_config.maximum_bytes_billed = ( - bigframes.options.compute.maximum_bytes_billed - ) - - if self._bq_kms_key_name: - job_config.destination_encryption_configuration = ( - bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) - ) - - return job_config - - def _prepare_load_job_config(self) -> bigquery.LoadJobConfig: - # Create a copy so that we don't mutate the original config passed - job_config = bigquery.LoadJobConfig() - - if self._bq_kms_key_name: - job_config.destination_encryption_configuration = ( - bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) - ) - - return job_config - def _prepare_copy_job_config(self) -> bigquery.CopyJobConfig: # Create a copy so that we don't mutate the original config passed job_config = bigquery.CopyJobConfig() @@ -1908,26 +1821,23 @@ def _start_query( ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ Starts BigQuery query job and waits for results. + + Do not execute dataframe through this API, instead use the executor. """ - job_config = self._prepare_query_job_config(job_config) - if not self._strictly_ordered: - job_config.labels = {"bigframes-mode": "unordered"} - try: - return bigframes.session._io.bigquery.start_query_with_client( - self, - sql, - job_config, - max_results, - timeout, - api_name=api_name, + job_config = bigquery.QueryJobConfig() if job_config is None else job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + # Maybe this should be pushed down into start_query_with_client + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed ) - except google.api_core.exceptions.BadRequest as e: - # Unfortunately, this error type does not have a separate error code or exception type - if "Resources exceeded during query execution" in e.message: - new_message = "Computation is too complex to execute as a single query. Try using DataFrame.cache() on intermediate results, or setting bigframes.options.compute.enable_multi_query_execution." - raise bigframes.exceptions.QueryComplexityError(new_message) from e - else: - raise + return bf_io_bigquery.start_query_with_client( + self.bqclient, + sql, + job_config, + max_results, + timeout, + api_name=api_name, + ) def _start_query_ml_ddl( self, @@ -1937,162 +1847,60 @@ def _start_query_ml_ddl( Starts BigQuery ML DDL query job (CREATE MODEL/ALTER MODEL/...) and waits for results. 
""" - job_config = self._prepare_query_job_config() + job_config = typing.cast(bigquery.QueryJobConfig, bigquery.QueryJobConfig()) + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) # BQML expects kms_key_name through OPTIONS and not through job config, # so we must reset any encryption set in the job config # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None - return bigframes.session._io.bigquery.start_query_with_client( - self, sql, job_config - ) - - def _cache_with_cluster_cols( - self, array_value: core.ArrayValue, cluster_cols: typing.Sequence[str] - ): - """Executes the query and uses the resulting table to rewrite future executions.""" - # TODO: Use this for all executions? Problem is that caching materializes extra - # ordering columns - - sql, ordering_info = self._compiler.compile_raw( - self._with_cached_executions(array_value.node) - ) - tmp_table = self._sql_to_temp_table( - sql, cluster_cols=cluster_cols, api_name="cached" - ) - cached_replacement = array_value.as_cached( - cache_table=self.bqclient.get_table(tmp_table), - ordering=ordering_info, - ).node - self._cached_executions[array_value.node] = cached_replacement - - def _cache_with_offsets(self, array_value: core.ArrayValue): - """Executes the query and uses the resulting table to rewrite future executions.""" - # TODO: Use this for all executions? Problem is that caching materializes extra - # ordering columns - if not self._strictly_ordered: - raise ValueError( - "Caching with offsets only supported in strictly ordered mode." - ) - offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") - sql = self._compiler.compile_unordered( - self._with_cached_executions( - array_value.promote_offsets(offset_column).node - ) - ) - - tmp_table = self._sql_to_temp_table( - sql, cluster_cols=[offset_column], api_name="cached" - ) - cached_replacement = array_value.as_cached( - cache_table=self.bqclient.get_table(tmp_table), - ordering=order.TotalOrdering.from_offset_col(offset_column), - ).node - self._cached_executions[array_value.node] = cached_replacement - - def _cache_with_session_awareness(self, array_value: core.ArrayValue) -> None: - # this is the occurence count across the whole session - forest = [obj._block.expr.node for obj in self.objects] - # These node types are cheap to re-compute - target, cluster_cols = bigframes.session.planner.session_aware_cache_plan( - array_value.node, forest - ) - if len(cluster_cols) > 0: - self._cache_with_cluster_cols(core.ArrayValue(target), cluster_cols) - elif self._strictly_ordered: - self._cache_with_offsets(core.ArrayValue(target)) - else: - self._cache_with_cluster_cols(core.ArrayValue(target), []) - - def _simplify_with_caching(self, array_value: core.ArrayValue): - """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" - # Apply existing caching first - if not bigframes.options.compute.enable_multi_query_execution: - return - - for _ in range(MAX_SUBTREE_FACTORINGS): - node_with_cache = self._with_cached_executions(array_value.node) - if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT: - return - - did_cache = self._cache_most_complex_subtree(array_value.node) - if not did_cache: - return - - def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool: - # TODO: If query fails, retry with lower 
complexity limit - selection = traversals.select_cache_target( - node, - min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), - max_complexity=QUERY_COMPLEXITY_LIMIT, - cache=dict(self._cached_executions), - # Heuristic: subtree_compleixty * (copies of subtree)^2 - heuristic=lambda complexity, count: math.log(complexity) - + 2 * math.log(count), - ) - if selection is None: - # No good subtrees to cache, just return original tree - return False - - self._cache_with_cluster_cols(core.ArrayValue(selection), []) - return True - - def _with_cached_executions(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - return traversals.replace_nodes(node, (dict(self._cached_executions))) - - def _is_trivially_executable(self, array_value: core.ArrayValue): - """ - Can the block be evaluated very cheaply? - If True, the array_value probably is not worth caching. - """ - # Once rewriting is available, will want to rewrite before - # evaluating execution cost. - return traversals.is_trivially_executable( - self._with_cached_executions(array_value.node) - ) + return bf_io_bigquery.start_query_with_client(self.bqclient, sql, job_config) def _execute( self, array_value: core.ArrayValue, - job_config: Optional[bigquery.job.QueryJobConfig] = None, *, ordered: bool = True, - dry_run=False, col_id_overrides: Mapping[str, str] = {}, ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - if not dry_run: - self._add_execution(1) - sql = self._to_sql( - array_value, ordered=ordered, col_id_overrides=col_id_overrides - ) # type:ignore - if job_config is None: - job_config = bigquery.QueryJobConfig(dry_run=dry_run) - else: - job_config.dry_run = dry_run + return self._executor.execute( + array_value, + ordered=ordered, + col_id_overrides=col_id_overrides, + ) - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. - return self._start_query( - sql=sql, - job_config=job_config, + def _export( + self, + array_value: core.ArrayValue, + destination: bigquery.TableReference, + *, + if_exists: Literal["fail", "replace", "append"] = "fail", + col_id_overrides: Mapping[str, str] = {}, + cluster_cols: Sequence[str], + ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + # Note: cluster_cols use pre-override column ids + return self._executor.export( + array_value, + destination=destination, + col_id_overrides=col_id_overrides, + if_exists=if_exists, + cluster_cols=cluster_cols, ) + def _dry_run( + self, array_value: core.ArrayValue, ordered: bool = True + ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + return self._executor.dry_run(array_value, ordered=ordered) + def _peek( self, array_value: core.ArrayValue, n_rows: int ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """A 'peek' efficiently accesses a small number of rows in the dataframe.""" - if not tree_properties.peekable(self._with_cached_executions(array_value.node)): - warnings.warn("Peeking this value cannot be done efficiently.") - sql = self._compiler.compile_peek( - self._with_cached_executions(array_value.node), n_rows - ) - - # TODO(swast): plumb through the api_name of the user-facing api that - # caused this query. 
- return self._start_query( - sql=sql, - ) + return self._executor.peek(array_value, n_rows) def _to_sql( self, @@ -2102,18 +1910,9 @@ def _to_sql( ordered: bool = False, enable_cache: bool = True, ) -> str: - if offset_column: - array_value = array_value.promote_offsets(offset_column) - node = ( - self._with_cached_executions(array_value.node) - if enable_cache - else array_value.node + return self._executor.to_sql( + array_value, offset_column, col_id_overrides, ordered, enable_cache ) - if ordered: - return self._compiler.compile_ordered( - node, col_id_overrides=col_id_overrides - ) - return self._compiler.compile_unordered(node, col_id_overrides=col_id_overrides) def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) @@ -2135,57 +1934,11 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() - def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference: - """Generate a random table ID with BigQuery DataFrames prefix. - - The generated ID will be stored and checked for deletion when the - session is closed, unless skip_cleanup is True. - - Args: - skip_cleanup (bool, default False): - If True, do not add the generated ID to the list of tables - to clean up when the session is closed. - - Returns: - google.cloud.bigquery.TableReference: - Fully qualified table ID of a table that doesn't exist. - """ - dataset = self._anonymous_dataset - session_id = self.session_id - now = datetime.datetime.now(datetime.timezone.utc) - random_id = uuid.uuid4().hex - table_id = _TEMP_TABLE_ID_FORMAT.format( - date=now.strftime("%Y%m%d"), session_id=session_id, random_id=random_id - ) - if not skip_cleanup: - self._table_ids.append(table_id) - return dataset.table(table_id) - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) -def _can_cluster_bq(field: bigquery.SchemaField): - # https://cloud.google.com/bigquery/docs/clustered-tables - # Notably, float is excluded - type_ = field.field_type - return type_ in ( - "INTEGER", - "INT64", - "STRING", - "NUMERIC", - "DECIMAL", - "BIGNUMERIC", - "BIGDECIMAL", - "DATE", - "DATETIME", - "TIMESTAMP", - "BOOL", - "BOOLEAN", - ) - - def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: """ For backwards-compatibility, convert any previously client-side only diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 3a33352a67..a77729cef9 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -18,7 +18,6 @@ import datetime import itertools -import os import re import textwrap import types @@ -35,14 +34,17 @@ import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql import bigframes.formatting_helpers as formatting_helpers +import bigframes.session.metrics + +CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." + IO_ORDERING_ID = "bqdf_row_nums" MAX_LABELS_COUNT = 64 _LIST_TABLES_LIMIT = 10000 # calls to bqclient.list_tables # will be limited to this many tables -LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" -CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." 
+_MAX_CLUSTER_COLUMNS = 4 def create_job_configs_labels( @@ -129,24 +131,28 @@ def table_ref_to_sql(table: bigquery.TableReference) -> str: def create_temp_table( - session: bigframes.session.Session, + bqclient: bigquery.Client, + table_ref: bigquery.TableReference, expiration: datetime.datetime, *, schema: Optional[Iterable[bigquery.SchemaField]] = None, cluster_columns: Optional[list[str]] = None, + kms_key: Optional[str] = None, ) -> str: """Create an empty table with an expiration in the desired session. The table will be deleted when the session is closed or the expiration is reached. """ - bqclient: bigquery.Client = session.bqclient - table_ref = session._random_table() destination = bigquery.Table(table_ref) destination.expires = expiration destination.schema = schema if cluster_columns: destination.clustering_fields = cluster_columns + if kms_key: + destination.encryption_configuration = bigquery.EncryptionConfiguration( + kms_key_name=kms_key + ) # Ok if already exists, since this will only happen from retries internal to this method # as the requested table id has a random UUID4 component. bqclient.create_table(destination, exists_ok=True) @@ -222,17 +228,17 @@ def add_labels(job_config, api_name: Optional[str] = None): def start_query_with_client( - session: bigframes.session.Session, + bq_client: bigquery.Client, sql: str, job_config: bigquery.job.QueryJobConfig, max_results: Optional[int] = None, timeout: Optional[float] = None, api_name: Optional[str] = None, + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ Starts query job and waits for results. """ - bq_client: bigquery.Client = session.bqclient add_labels(job_config, api_name=api_name) try: @@ -250,64 +256,11 @@ def start_query_with_client( else: results_iterator = query_job.result(max_results=max_results) - stats = get_performance_stats(query_job) - if stats is not None: - bytes_processed, slot_millis = stats - session._add_bytes_processed(bytes_processed) - session._add_slot_millis(slot_millis) - if LOGGING_NAME_ENV_VAR in os.environ: - # when running notebooks via pytest nbmake - write_stats_to_disk(bytes_processed, slot_millis) - + if metrics is not None: + metrics.count_job_stats(query_job) return results_iterator, query_job -def get_performance_stats(query_job: bigquery.QueryJob) -> Optional[Tuple[int, int]]: - """Parse the query job for performance stats. - - Return None if the stats do not reflect real work done in bigquery. - """ - bytes_processed = query_job.total_bytes_processed - if not isinstance(bytes_processed, int): - return None # filter out mocks - if query_job.configuration.dry_run: - # dry run stats are just predictions of the real run - bytes_processed = 0 - - slot_millis = query_job.slot_millis - if not isinstance(slot_millis, int): - return None # filter out mocks - if query_job.configuration.dry_run: - # dry run stats are just predictions of the real run - slot_millis = 0 - - return bytes_processed, slot_millis - - -def write_stats_to_disk(bytes_processed: int, slot_millis: int): - """For pytest runs only, log information about the query job - to a file in order to create a performance report. 
- """ - if LOGGING_NAME_ENV_VAR not in os.environ: - raise EnvironmentError( - "Environment variable {env_var} is not set".format( - env_var=LOGGING_NAME_ENV_VAR - ) - ) - test_name = os.environ[LOGGING_NAME_ENV_VAR] - current_directory = os.getcwd() - - # store bytes processed - bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") - with open(bytes_file, "a") as f: - f.write(str(bytes_processed) + "\n") - - # store slot milliseconds - bytes_file = os.path.join(current_directory, test_name + ".slotmillis") - with open(bytes_file, "a") as f: - f.write(str(slot_millis) + "\n") - - def delete_tables_matching_session_id( client: bigquery.Client, dataset: bigquery.DatasetReference, session_id: str ) -> None: @@ -504,3 +457,34 @@ def compile_filters(filters: third_party_pandas_gbq.FiltersType) -> str: filter_string = and_expression return filter_string + + +def select_cluster_cols( + schema: typing.Sequence[bigquery.SchemaField], + cluster_candidates: typing.Sequence[str], +) -> typing.Sequence[str]: + return [ + item.name + for item in schema + if (item.name in cluster_candidates) and _can_cluster_bq(item) + ][:_MAX_CLUSTER_COLUMNS] + + +def _can_cluster_bq(field: bigquery.SchemaField): + # https://cloud.google.com/bigquery/docs/clustered-tables + # Notably, float is excluded + type_ = field.field_type + return type_ in ( + "INTEGER", + "INT64", + "STRING", + "NUMERIC", + "DECIMAL", + "BIGNUMERIC", + "BIGDECIMAL", + "DATE", + "DATETIME", + "TIMESTAMP", + "BOOL", + "BOOLEAN", + ) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 85664d8dc8..7b53d40f74 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -144,6 +144,21 @@ def _create_bigquery_client(self): project=self._project, location=self._location, ) + if self._bq_kms_key_name: + # Note: Key configuration only applies automatically to load and query jobs, not copy jobs. + encryption_config = bigquery.EncryptionConfiguration( + kms_key_name=self._bq_kms_key_name + ) + default_load_job_config = bigquery.LoadJobConfig() + default_query_job_config = bigquery.QueryJobConfig() + default_load_job_config.destination_encryption_configuration = ( + encryption_config + ) + default_query_job_config.destination_encryption_configuration = ( + encryption_config + ) + bq_client.default_load_job_config = default_load_job_config + bq_client.default_query_job_config = default_query_job_config return bq_client diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py new file mode 100644 index 0000000000..539658a18c --- /dev/null +++ b/bigframes/session/executor.py @@ -0,0 +1,346 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import annotations
+
+import math
+from typing import cast, Iterable, Literal, Mapping, Optional, Sequence, Tuple
+import warnings
+import weakref
+
+import google.api_core.exceptions
+import google.cloud.bigquery as bigquery
+import google.cloud.bigquery.job as bq_job
+
+import bigframes.core
+import bigframes.core.compile
+import bigframes.core.guid
+import bigframes.core.nodes as nodes
+import bigframes.core.ordering as order
+import bigframes.core.tree_properties as tree_properties
+import bigframes.formatting_helpers as formatting_helpers
+import bigframes.session._io.bigquery as bq_io
+import bigframes.session.metrics
+import bigframes.session.planner
+import bigframes.session.temp_storage
+
+# Max complexity that should be executed as a single query
+QUERY_COMPLEXITY_LIMIT = 1e7
+# Number of times to factor out subqueries before giving up.
+MAX_SUBTREE_FACTORINGS = 5
+
+_MAX_CLUSTER_COLUMNS = 4
+
+
+class BigQueryCachingExecutor:
+    """Computes BigFrames values using BigQuery Engine.
+
+    This executor can cache expressions. If those expressions are executed later, this session
+    will re-use the pre-existing results from previous executions.
+
+    This class is not thread-safe.
+    """
+
+    def __init__(
+        self,
+        bqclient: bigquery.Client,
+        storage_manager: bigframes.session.temp_storage.TemporaryGbqStorageManager,
+        strictly_ordered: bool = True,
+        metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,
+    ):
+        self.bqclient = bqclient
+        self.storage_manager = storage_manager
+        self.compiler: bigframes.core.compile.SQLCompiler = (
+            bigframes.core.compile.SQLCompiler(strict=strictly_ordered)
+        )
+        self.strictly_ordered: bool = strictly_ordered
+        self._cached_executions: weakref.WeakKeyDictionary[
+            nodes.BigFrameNode, nodes.BigFrameNode
+        ] = weakref.WeakKeyDictionary()
+        self.metrics = metrics
+
+    def to_sql(
+        self,
+        array_value: bigframes.core.ArrayValue,
+        offset_column: Optional[str] = None,
+        col_id_overrides: Mapping[str, str] = {},
+        ordered: bool = False,
+        enable_cache: bool = True,
+    ) -> str:
+        """
+        Convert an ArrayValue to a sql query that will yield its value.
+        """
+        if offset_column:
+            array_value = array_value.promote_offsets(offset_column)
+        node = (
+            self._with_cached_executions(array_value.node)
+            if enable_cache
+            else array_value.node
+        )
+        if ordered:
+            return self.compiler.compile_ordered(
+                node, col_id_overrides=col_id_overrides
+            )
+        return self.compiler.compile_unordered(node, col_id_overrides=col_id_overrides)
+
+    def execute(
+        self,
+        array_value: bigframes.core.ArrayValue,
+        *,
+        ordered: bool = True,
+        col_id_overrides: Mapping[str, str] = {},
+    ):
+        """
+        Execute the ArrayValue, storing the result to a temporary session-owned table.
+        """
+        if bigframes.options.compute.enable_multi_query_execution:
+            self._simplify_with_caching(array_value)
+
+        sql = self.to_sql(
+            array_value, ordered=ordered, col_id_overrides=col_id_overrides
+        )
+        job_config = bigquery.QueryJobConfig()
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
+        return self._run_execute_query(
+            sql=sql,
+            job_config=job_config,
+        )
+
+    def export(
+        self,
+        array_value,
+        col_id_overrides: Mapping[str, str],
+        destination: bigquery.TableReference,
+        if_exists: Literal["fail", "replace", "append"] = "fail",
+        cluster_cols: Sequence[str] = [],
+    ):
+        """
+        Export the ArrayValue to an existing BigQuery table.
+        """
+        dispositions = {
+            "fail": bigquery.WriteDisposition.WRITE_EMPTY,
+            "replace": bigquery.WriteDisposition.WRITE_TRUNCATE,
+            "append": bigquery.WriteDisposition.WRITE_APPEND,
+        }
+        sql = self.to_sql(array_value, ordered=False, col_id_overrides=col_id_overrides)
+        job_config = bigquery.QueryJobConfig(
+            write_disposition=dispositions[if_exists],
+            destination=destination,
+            clustering_fields=cluster_cols if cluster_cols else None,
+        )
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
+        return self._run_execute_query(
+            sql=sql,
+            job_config=job_config,
+        )
+
+    def dry_run(self, array_value: bigframes.core.ArrayValue, ordered: bool = True):
+        """
+        Dry run executing the ArrayValue.
+
+        Does not actually execute the data but will get stats and indicate any invalid query errors.
+        """
+        sql = self.to_sql(array_value, ordered=ordered)
+        job_config = bigquery.QueryJobConfig(dry_run=True)
+        bq_io.add_labels(job_config)
+        query_job = self.bqclient.query(sql, job_config=job_config)
+        results_iterator = query_job.result()
+        return results_iterator, query_job
+
+    def peek(
+        self, array_value: bigframes.core.ArrayValue, n_rows: int
+    ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
+        """A 'peek' efficiently accesses a small number of rows in the dataframe."""
+        if not tree_properties.peekable(self._with_cached_executions(array_value.node)):
+            warnings.warn("Peeking this value cannot be done efficiently.")
+        sql = self.compiler.compile_peek(
+            self._with_cached_executions(array_value.node), n_rows
+        )
+
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
+        return self._run_execute_query(
+            sql=sql,
+        )
+
+    # Helpers
+    def _run_execute_query(
+        self,
+        sql: str,
+        job_config: Optional[bq_job.QueryJobConfig] = None,
+        api_name: Optional[str] = None,
+    ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
+        """
+        Starts BigQuery query job and waits for results.
+        """
+        job_config = bq_job.QueryJobConfig() if job_config is None else job_config
+        if bigframes.options.compute.maximum_bytes_billed is not None:
+            job_config.maximum_bytes_billed = (
+                bigframes.options.compute.maximum_bytes_billed
+            )
+        # Note: add_labels is global scope which may have unexpected effects
+        bq_io.add_labels(job_config, api_name=api_name)
+
+        if not self.strictly_ordered:
+            job_config.labels["bigframes-mode"] = "unordered"
+        try:
+            query_job = self.bqclient.query(sql, job_config=job_config)
+            opts = bigframes.options.display
+            if opts.progress_bar is not None and not query_job.configuration.dry_run:
+                results_iterator = formatting_helpers.wait_for_query_job(
+                    query_job, progress_bar=opts.progress_bar
+                )
+            else:
+                results_iterator = query_job.result()
+
+            if self.metrics is not None:
+                self.metrics.count_job_stats(query_job)
+            return results_iterator, query_job
+
+        except google.api_core.exceptions.BadRequest as e:
+            # Unfortunately, this error type does not have a separate error code or exception type
+            if "Resources exceeded during query execution" in e.message:
+                new_message = "Computation is too complex to execute as a single query. Try using DataFrame.cache() on intermediate results, or setting bigframes.options.compute.enable_multi_query_execution."
+                raise bigframes.exceptions.QueryComplexityError(new_message) from e
+            else:
+                raise
+
+    def _with_cached_executions(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode:
+        return tree_properties.replace_nodes(node, (dict(self._cached_executions)))
+
+    def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue):
+        """
+        Can the block be evaluated very cheaply?
+        If True, the array_value probably is not worth caching.
+        """
+        # Once rewriting is available, will want to rewrite before
+        # evaluating execution cost.
+        return tree_properties.is_trivially_executable(
+            self._with_cached_executions(array_value.node)
+        )
+
+    def _cache_with_cluster_cols(
+        self, array_value: bigframes.core.ArrayValue, cluster_cols: Sequence[str]
+    ):
+        """Executes the query and uses the resulting table to rewrite future executions."""
+
+        sql, schema, ordering_info = self.compiler.compile_raw(
+            self._with_cached_executions(array_value.node)
+        )
+        tmp_table = self._sql_as_cached_temp_table(
+            sql,
+            schema,
+            cluster_cols=bq_io.select_cluster_cols(schema, cluster_cols),
+        )
+        cached_replacement = array_value.as_cached(
+            cache_table=self.bqclient.get_table(tmp_table),
+            ordering=ordering_info,
+        ).node
+        self._cached_executions[array_value.node] = cached_replacement
+
+    def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue):
+        """Executes the query and uses the resulting table to rewrite future executions."""
+
+        if not self.strictly_ordered:
+            raise ValueError(
+                "Caching with offsets only supported in strictly ordered mode."
+            )
+        offset_column = bigframes.core.guid.generate_guid("bigframes_offsets")
+        node_w_offsets = array_value.promote_offsets(offset_column).node
+        sql = self.compiler.compile_unordered(
+            self._with_cached_executions(node_w_offsets)
+        )
+
+        tmp_table = self._sql_as_cached_temp_table(
+            sql,
+            node_w_offsets.schema.to_bigquery(),
+            cluster_cols=[offset_column],
+        )
+        cached_replacement = array_value.as_cached(
+            cache_table=self.bqclient.get_table(tmp_table),
+            ordering=order.TotalOrdering.from_offset_col(offset_column),
+        ).node
+        self._cached_executions[array_value.node] = cached_replacement
+
+    def _cache_with_session_awareness(
+        self,
+        array_value: bigframes.core.ArrayValue,
+        session_forest: Iterable[nodes.BigFrameNode],
+    ) -> None:
+        # These node types are cheap to re-compute
+        target, cluster_cols = bigframes.session.planner.session_aware_cache_plan(
+            array_value.node, list(session_forest)
+        )
+        if len(cluster_cols) > 0:
+            self._cache_with_cluster_cols(
+                bigframes.core.ArrayValue(target), cluster_cols
+            )
+        elif self.strictly_ordered:
+            self._cache_with_offsets(bigframes.core.ArrayValue(target))
+        else:
+            self._cache_with_cluster_cols(bigframes.core.ArrayValue(target), [])
+
+    def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue):
+        """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces."""
+        # Apply existing caching first
+        for _ in range(MAX_SUBTREE_FACTORINGS):
+            node_with_cache = self._with_cached_executions(array_value.node)
+            if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT:
+                return
+
+            did_cache = self._cache_most_complex_subtree(array_value.node)
+            if not did_cache:
+                return
+
+    def _cache_most_complex_subtree(self, node: nodes.BigFrameNode) -> bool:
+        # TODO: If query fails, retry with lower complexity limit
+        selection = tree_properties.select_cache_target(
+            node,
+            min_complexity=(QUERY_COMPLEXITY_LIMIT / 500),
+            max_complexity=QUERY_COMPLEXITY_LIMIT,
+            cache=dict(self._cached_executions),
+            # Heuristic: subtree_complexity * (copies of subtree)^2
+            heuristic=lambda complexity, count: math.log(complexity)
+            + 2 * math.log(count),
+        )
+        if selection is None:
+            # No good subtrees to cache, just return original tree
+            return False
+
+        self._cache_with_cluster_cols(bigframes.core.ArrayValue(selection), [])
+        return True
+
+    def _sql_as_cached_temp_table(
+        self,
+        sql: str,
+        schema: Sequence[bigquery.SchemaField],
+        cluster_cols: Sequence[str],
+    ) -> bigquery.TableReference:
+        assert len(cluster_cols) <= _MAX_CLUSTER_COLUMNS
+        temp_table = self.storage_manager.create_temp_table(schema, cluster_cols)
+
+        # TODO: Get default job config settings
+        job_config = cast(
+            bigquery.QueryJobConfig,
+            bigquery.QueryJobConfig.from_api_repr({}),
+        )
+        job_config.destination = temp_table
+        _, query_job = self._run_execute_query(
+            sql,
+            job_config=job_config,
+            api_name="cached",
+        )
+        return query_job.destination
diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py
new file mode 100644
index 0000000000..85a7f6aa4b
--- /dev/null
+++ b/bigframes/session/metrics.py
@@ -0,0 +1,88 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+import os
+from typing import Optional, Tuple
+
+import google.cloud.bigquery as bigquery
+import google.cloud.bigquery.job as bq_job
+
+LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"
+
+
+@dataclasses.dataclass
+class ExecutionMetrics:
+    execution_count: int = 0
+    slot_millis: int = 0
+    bytes_processed: int = 0
+
+    def count_job_stats(self, query_job: bq_job.QueryJob):
+        stats = get_performance_stats(query_job)
+        if stats is not None:
+            bytes_processed, slot_millis = stats
+            self.execution_count += 1
+            self.bytes_processed += bytes_processed
+            self.slot_millis += slot_millis
+            if LOGGING_NAME_ENV_VAR in os.environ:
+                # when running notebooks via pytest nbmake
+                write_stats_to_disk(bytes_processed, slot_millis)
+
+
+def get_performance_stats(query_job: bigquery.QueryJob) -> Optional[Tuple[int, int]]:
+    """Parse the query job for performance stats.
+
+    Return None if the stats do not reflect real work done in bigquery.
+    """
+    bytes_processed = query_job.total_bytes_processed
+    if not isinstance(bytes_processed, int):
+        return None  # filter out mocks
+    if query_job.configuration.dry_run:
+        # dry run stats are just predictions of the real run
+        bytes_processed = 0
+
+    slot_millis = query_job.slot_millis
+    if not isinstance(slot_millis, int):
+        return None  # filter out mocks
+    if query_job.configuration.dry_run:
+        # dry run stats are just predictions of the real run
+        slot_millis = 0
+
+    return bytes_processed, slot_millis
+
+
+def write_stats_to_disk(bytes_processed: int, slot_millis: int):
+    """For pytest runs only, log information about the query job
+    to a file in order to create a performance report.
+ """ + if LOGGING_NAME_ENV_VAR not in os.environ: + raise EnvironmentError( + "Environment variable {env_var} is not set".format( + env_var=LOGGING_NAME_ENV_VAR + ) + ) + test_name = os.environ[LOGGING_NAME_ENV_VAR] + current_directory = os.getcwd() + + # store bytes processed + bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") + with open(bytes_file, "a") as f: + f.write(str(bytes_processed) + "\n") + + # store slot milliseconds + bytes_file = os.path.join(current_directory, test_name + ".slotmillis") + with open(bytes_file, "a") as f: + f.write(str(slot_millis) + "\n") diff --git a/bigframes/session/temp_storage.py b/bigframes/session/temp_storage.py new file mode 100644 index 0000000000..fb8c4bac7a --- /dev/null +++ b/bigframes/session/temp_storage.py @@ -0,0 +1,96 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +from typing import List, Optional, Sequence +import uuid + +import google.cloud.bigquery as bigquery + +import bigframes.constants as constants +import bigframes.session._io.bigquery as bf_io_bigquery + +_TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}" + + +class TemporaryGbqStorageManager: + """ + Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. + """ + + def __init__( + self, + bqclient: bigquery.Client, + dataset: bigquery.DatasetReference, + session_id: str, + *, + kms_key: Optional[str] = None + ): + self.bqclient = bqclient + self.dataset = dataset + self.session_id = session_id + self._table_ids: List[str] = [] + self._kms_key = kms_key + + def create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] + ) -> bigquery.TableReference: + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + expiration = ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + table = bf_io_bigquery.create_temp_table( + self.bqclient, + self._random_table(), + expiration, + schema=schema, + cluster_columns=list(cluster_cols), + kms_key=self._kms_key, + ) + return bigquery.TableReference.from_string(table) + + def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference: + """Generate a random table ID with BigQuery DataFrames prefix. + + The generated ID will be stored and checked for deletion when the + session is closed, unless skip_cleanup is True. + + Args: + skip_cleanup (bool, default False): + If True, do not add the generated ID to the list of tables + to clean up when the session is closed. + + Returns: + google.cloud.bigquery.TableReference: + Fully qualified table ID of a table that doesn't exist. 
+ """ + now = datetime.datetime.now(datetime.timezone.utc) + random_id = uuid.uuid4().hex + table_id = _TEMP_TABLE_ID_FORMAT.format( + date=now.strftime("%Y%m%d"), session_id=self.session_id, random_id=random_id + ) + if not skip_cleanup: + self._table_ids.append(table_id) + return self.dataset.table(table_id) + + def clean_up_tables(self): + """Delete tables that were created with this session's session_id.""" + client = self.bqclient + project_id = self.dataset.project + dataset_id = self.dataset.dataset_id + + for table_id in self._table_ids: + full_id = ".".join([project_id, dataset_id, table_id]) + client.delete_table(full_id, not_found_ok=True) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index 2b82d0133b..9f42c4ae94 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -59,7 +59,7 @@ def test_read_gbq_for_large_tables( assert len(df.columns) != 0 -def test_close(session): +def test_close(session: bigframes.Session): # we will create two tables and confirm that they are deleted # when the session is closed @@ -69,8 +69,12 @@ def test_close(session): datetime.datetime.now(datetime.timezone.utc) + bigframes.constants.DEFAULT_EXPIRATION ) - full_id_1 = bigframes.session._io.bigquery.create_temp_table(session, expiration) - full_id_2 = bigframes.session._io.bigquery.create_temp_table(session, expiration) + full_id_1 = bigframes.session._io.bigquery.create_temp_table( + session.bqclient, session._temp_storage_manager._random_table(), expiration + ) + full_id_2 = bigframes.session._io.bigquery.create_temp_table( + session.bqclient, session._temp_storage_manager._random_table(), expiration + ) # check that the tables were actually created assert bqclient.get_table(full_id_1).created is not None @@ -101,8 +105,12 @@ def test_clean_up_by_session_id(): datetime.datetime.now(datetime.timezone.utc) + bigframes.constants.DEFAULT_EXPIRATION ) - bigframes.session._io.bigquery.create_temp_table(session, expiration) - bigframes.session._io.bigquery.create_temp_table(session, expiration) + bigframes.session._io.bigquery.create_temp_table( + session.bqclient, session._temp_storage_manager._random_table(), expiration + ) + bigframes.session._io.bigquery.create_temp_table( + session.bqclient, session._temp_storage_manager._random_table(), expiration + ) # check that some table exists with the expected session_id tables_before = bqclient.list_tables( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index d838251dca..e1644c20b4 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2179,10 +2179,10 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = session._execution_count + execution_count_before = session._metrics.execution_count bf_df = scalars_df[df_columns] bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = session._execution_count + execution_count_after = session._metrics.execution_count pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 428a6a28bf..65a98b014d 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -86,10 +86,10 @@ def test_session_load_job(bq_cmek, session_with_bq_cmek): 
pytest.skip("no cmek set for testing") # pragma: NO COVER # Session should have cmek set in the default query and load job configs - load_table = session_with_bq_cmek._random_table() + load_table = session_with_bq_cmek._temp_storage_manager._random_table() df = pandas.DataFrame({"col0": [1, 2, 3]}) - load_job_config = session_with_bq_cmek._prepare_load_job_config() + load_job_config = bigquery.LoadJobConfig() load_job_config.schema = [ bigquery.SchemaField(df.columns[0], bigquery.enums.SqlTypeNames.INT64) ] @@ -186,7 +186,7 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): # Write the result to BQ custom table and assert encryption session_with_bq_cmek.bqclient.get_table(output_table_id) - output_table_ref = session_with_bq_cmek._random_table() + output_table_ref = session_with_bq_cmek._temp_storage_manager._random_table() output_table_id = str(output_table_ref) df.to_gbq(output_table_id) output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2f779f337e..5b5db74ea6 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -556,7 +556,8 @@ def test_read_gbq_with_custom_global_labels( bigframes.options.compute.assign_extra_query_labels(test1=1, test2="abc") bigframes.options.compute.extra_query_labels["test3"] = False - job_labels = session.read_gbq(scalars_table_id).query_job.labels # type:ignore + query_job = session.read_gbq(scalars_table_id).query_job + job_labels = query_job.labels # type:ignore expected_labels = {"test1": "1", "test2": "abc", "test3": "false"} # All jobs should include a bigframes-api key. See internal issue 336521938. diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 1d6be3dff8..46c3c92036 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -155,7 +155,12 @@ def test_create_temp_table_default_expiration(): ) session = resources.create_bigquery_session() - bigframes.session._io.bigquery.create_temp_table(session, expiration) + table_ref = bigquery.TableReference.from_string( + "test-project.test_dataset.bqdf_new_random_table" + ) + bigframes.session._io.bigquery.create_temp_table( + session.bqclient, table_ref, expiration + ) session.bqclient.create_table.assert_called_once() call_args = session.bqclient.create_table.call_args From 6bc6a41426fbbb60e77cd77f80860f88a1751a4b Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:14:10 -0700 Subject: [PATCH 3/7] feat: add llm.TextEmbeddingGenerator to support new embedding models (#905) * feat: add llm.TextEmbeddingGenerator to support new embedding models * fix docs --- bigframes/ml/llm.py | 166 +++++++++++++++++++++++++++++- bigframes/ml/loader.py | 3 + tests/system/small/ml/test_llm.py | 41 ++++++++ 3 files changed, 207 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 2517178d89..45634423c6 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -40,11 +40,18 @@ _EMBEDDING_GENERATOR_GECKO_ENDPOINT = "textembedding-gecko" _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT = "textembedding-gecko-multilingual" -_EMBEDDING_GENERATOR_ENDPOINTS = ( +_PALM2_EMBEDDING_GENERATOR_ENDPOINTS = ( _EMBEDDING_GENERATOR_GECKO_ENDPOINT, _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, ) +_TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004" +_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT 
= "text-multilingual-embedding-002" +_TEXT_EMBEDDING_ENDPOINTS = ( + _TEXT_EMBEDDING_004_ENDPOINT, + _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, +) + _GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" @@ -57,6 +64,7 @@ _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" _ML_EMBED_TEXT_STATUS = "ml_embed_text_status" +_ML_GENERATE_EMBEDDING_STATUS = "ml_generate_embedding_status" @log_adapter.class_logger @@ -387,6 +395,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: class PaLM2TextEmbeddingGenerator(base.BaseEstimator): """PaLM2 text embedding generator LLM model. + .. note:: + Models in this class are outdated and going to be deprecated. To use the most updated text embedding models, go to the TextEmbeddingGenerator class. + + Args: model_name (str, Default to "textembedding-gecko"): The model for text embedding. “textembedding-gecko” returns model embeddings for text inputs. @@ -447,9 +459,9 @@ def _create_bqml_model(self): iam_role="aiplatform.user", ) - if self.model_name not in _EMBEDDING_GENERATOR_ENDPOINTS: + if self.model_name not in _PALM2_EMBEDDING_GENERATOR_ENDPOINTS: raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_EMBEDDING_GENERATOR_ENDPOINTS)}." + f"Model name {self.model_name} is not supported. We only support {', '.join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS)}." ) endpoint = ( @@ -551,6 +563,154 @@ def to_gbq( return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger +class TextEmbeddingGenerator(base.BaseEstimator): + """Text embedding generator LLM model. + + Args: + model_name (str, Default to "text-embedding-004"): + The model for text embedding. Possible values are "text-embedding-004" or "text-multilingual-embedding-002". + text-embedding models returns model embeddings for text inputs. + text-multilingual-embedding models returns model embeddings for text inputs which support over 100 languages. + Default to "text-embedding-004". + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + If None, use default connection in session context. + """ + + def __init__( + self, + *, + model_name: Literal[ + "text-embedding-004", "text-multilingual-embedding-002" + ] = "text-embedding-004", + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + self.model_name = model_name + self.session = session or bpd.get_global_session() + self._bq_connection_manager = self.session.bqconnectionmanager + + connection_name = connection_name or self.session._bq_connection + self.connection_name = clients.resolve_full_bq_connection_name( + connection_name, + default_project=self.session._project, + default_location=self.session._location, + ) + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + if not self.connection_name: + raise ValueError( + "Must provide connection_name, either in constructor or through session options." 
+            )
+
+        if self._bq_connection_manager:
+            connection_name_parts = self.connection_name.split(".")
+            if len(connection_name_parts) != 3:
+                raise ValueError(
+                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
+                )
+            self._bq_connection_manager.create_bq_connection(
+                project_id=connection_name_parts[0],
+                location=connection_name_parts[1],
+                connection_id=connection_name_parts[2],
+                iam_role="aiplatform.user",
+            )
+
+        if self.model_name not in _TEXT_EMBEDDING_ENDPOINTS:
+            raise ValueError(
+                f"Model name {self.model_name} is not supported. We only support {', '.join(_TEXT_EMBEDDING_ENDPOINTS)}."
+            )
+
+        options = {
+            "endpoint": self.model_name,
+        }
+        return self._bqml_model_factory.create_remote_model(
+            session=self.session, connection_name=self.connection_name, options=options
+        )
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.Session, bq_model: bigquery.Model
+    ) -> TextEmbeddingGenerator:
+        assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED"
+        assert "remoteModelInfo" in bq_model._properties
+        assert "endpoint" in bq_model._properties["remoteModelInfo"]
+        assert "connection" in bq_model._properties["remoteModelInfo"]
+
+        # Parse the remote model endpoint
+        bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"]
+        model_connection = bq_model._properties["remoteModelInfo"]["connection"]
+        model_endpoint = bqml_endpoint.split("/")[-1]
+
+        model = cls(
+            session=session,
+            model_name=model_endpoint,  # type: ignore
+            connection_name=model_connection,
+        )
+
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        """Predict the result from input DataFrame.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Input DataFrame, which needs to contain a column with name "content". Only this column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples.
+
+        Returns:
+            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values.
+        """
+
+        # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models
+        (X,) = utils.convert_to_dataframe(X)
+
+        if len(X.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identifies the column by name
+        col_label = cast(blocks.Label, X.columns[0])
+        X = X.rename(columns={col_label: "content"})
+
+        options = {
+            "flatten_json_output": True,
+        }
+
+        df = self._bqml_model.generate_embedding(X, options)
+
+        if (df[_ML_GENERATE_EMBEDDING_STATUS] != "").any():
+            warnings.warn(
+                f"Some predictions failed. Check column {_ML_GENERATE_EMBEDDING_STATUS} for detailed status. You may want to filter the failed rows and retry.",
+                RuntimeWarning,
+            )
+
+        return df
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerator:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            TextEmbeddingGenerator: Saved model."""
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
+
+
 @log_adapter.class_logger
 class GeminiTextGenerator(base.BaseEstimator):
     """Gemini text generator LLM model.
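Putting the new class together: the constructor resolves the BigQuery connection and creates the remote model, and `predict` accepts exactly one input column, renaming it to `content` before calling `generate_embedding`. A minimal usage sketch; the project and connection names are placeholders, and running it needs a session with Vertex AI access:

```python
import bigframes.pandas as bpd
from bigframes.ml.llm import TextEmbeddingGenerator

# "my-project.us.bigframes-default-connection" is a placeholder connection name.
model = TextEmbeddingGenerator(
    model_name="text-embedding-004",
    connection_name="my-project.us.bigframes-default-connection",
)

df = bpd.DataFrame({"content": ["hello world", "bonjour le monde"]})
embeddings = model.predict(df)

# Each row carries the embedding vector (768 dimensions for text-embedding-004)
# plus per-row status columns such as ml_generate_embedding_status.
print(embeddings["ml_generate_embedding_result"].head())
```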
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 515fb50c6f..bd01342152 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -63,6 +63,8 @@ llm._GEMINI_PRO_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, + llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, + llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, } ) @@ -84,6 +86,7 @@ def from_bq( imported.XGBoostModel, llm.PaLM2TextGenerator, llm.PaLM2TextEmbeddingGenerator, + llm.TextEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, preprocessing.PreprocessingType, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b926004fd8..c2f62096d0 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -304,6 +304,47 @@ def test_embedding_generator_predict_series_success( assert len(value) == 768 +@pytest.mark.parametrize( + "model_name", + ("text-embedding-004", "text-multilingual-embedding-002"), +) +def test_create_load_text_embedding_generator_model( + dataset_id, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + assert text_embedding_model is not None + assert text_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = text_embedding_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + assert reloaded_model.model_name == model_name + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_embedding_generator_predict_default_params_success( + llm_text_df, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(llm_text_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_embedding_result" in df.columns + series = df["ml_generate_embedding_result"] + value = series[0] + assert len(value) == 768 + + @pytest.mark.parametrize( "model_name", ("gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514"), From d9b8ef56deb0c776edeeb0112bd9d35d5ed1b70e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:32:58 -0700 Subject: [PATCH 4/7] docs: update embedding model notebooks (#906) * feat: add llm.TextEmbeddingGenerator to support new embedding models * fix docs * docs: update embedding notebooks --- .../bq_dataframes_llm_code_generation.ipynb | 4 +- .../bq_dataframes_llm_kmeans.ipynb | 1091 +++++++++-------- 2 files changed, 553 insertions(+), 542 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 4ea766604d..c0c3c58a3c 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -89,7 +89,7 @@ "\n", "The steps include:\n", "\n", - "- Defining an LLM model in BigQuery DataFrames, specifically the [`text-bison` model of the PaLM 
API](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text), using `bigframes.ml.llm`.\n", + "- Defining an LLM model in BigQuery DataFrames, specifically the [Gemini Model](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models), using `bigframes.ml.llm`.\n", "- Creating a DataFrame by reading in data from Cloud Storage.\n", "- Manipulating data in the DataFrame to build LLM prompts.\n", "- Sending DataFrame prompts to the LLM model using the `predict` method.\n", @@ -385,7 +385,7 @@ "source": [ "# Define the LLM model\n", "\n", - "BigQuery DataFrames provides integration with [`text-bison` model of the PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text) via Vertex AI.\n", + "BigQuery DataFrames provides integration with [Gemini Models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models) via Vertex AI.\n", "\n", "This section walks through a few steps required in order to use the model in your notebook." ] diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index ab6fd93f9a..d49a44a780 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -57,9 +57,9 @@ "source": [ "## Overview\n", "\n", - "The goal of this notebook is to demonstrate a comment characterization algorithm for an online business. We will accomplish this using [Google's PaLM 2](https://ai.google/discover/palm2/) and [KMeans clustering](https://en.wikipedia.org/wiki/K-means_clustering) in three steps:\n", + "The goal of this notebook is to demonstrate a comment characterization algorithm for an online business. We will accomplish this using [Google's Embedding Models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#models) and [KMeans clustering](https://en.wikipedia.org/wiki/K-means_clustering) in three steps:\n", "\n", - "1. Use PaLM2TextEmbeddingGenerator to [generate text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) for each of 10000 complaints sent to an online bank. If you're not familiar with what a text embedding is, it's a list of numbers that are like coordinates in an imaginary \"meaning space\" for sentences. (It's like [word embeddings](https://en.wikipedia.org/wiki/Word_embedding), but for more general text.) The important point for our purposes is that similar sentences are close to each other in this imaginary space.\n", + "1. Use TextEmbeddingGenerator to [generate text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) for each of 10000 complaints sent to an online bank. If you're not familiar with what a text embedding is, it's a list of numbers that are like coordinates in an imaginary \"meaning space\" for sentences. (It's like [word embeddings](https://en.wikipedia.org/wiki/Word_embedding), but for more general text.) The important point for our purposes is that similar sentences are close to each other in this imaginary space.\n", "2. Use KMeans clustering to group together complaints whose text embeddings are near to eachother. This will give us sets of similar complaints, but we don't yet know _why_ these complaints are similar.\n", "3. Prompt GeminiTextGenerator in English asking what the difference is between the groups of complaints that we got. 
Thanks to the power of modern LLMs, the response might give us a very good idea of what these complaints are all about, but remember to [\"understand the limits of your dataset and model.\"](https://ai.google/responsibility/responsible-ai-practices/#:~:text=Understand%20the%20limitations%20of%20your%20dataset%20and%20model)\n", "\n", @@ -289,7 +289,7 @@ { "data": { "text/html": [ - "Query job 952b852e-7cf0-493d-8258-fe60daf45ebf is DONE. 2.3 GB processed. Open Job" + "Query job 960f637d-89eb-4bbf-a34c-36ed624e8e9a is DONE. 2.3 GB processed. Open Job" ], "text/plain": [ "" @@ -301,7 +301,7 @@ { "data": { "text/html": [ - "Query job f9939880-6c66-4da5-9e90-daf8d9a9d83c is DONE. 50.3 MB processed. Open Job" + "Query job 59bb207c-98e1-4dab-8686-320f276b09df is DONE. 63.7 MB processed. Open Job" ], "text/plain": [ "" @@ -336,24 +336,24 @@ " \n", " \n", " \n", - " 1799560\n", - " Thursday, XX/XX/XXXX, unauthorized charges wer...\n", + " 2557016\n", + " I've been disputing fraud accounts on my credi...\n", " \n", " \n", - " 1800272\n", - " The credit reporting company is reporting inac...\n", + " 2557686\n", + " American Express Platinum totally messed up my...\n", " \n", " \n", - " 1800409\n", - " In accordance with the Fair Credit Reporting a...\n", + " 2558170\n", + " I recently looked at my credit report and noti...\n", " \n", " \n", - " 1800550\n", - " I told the credit bureaus to \" investigate eve...\n", + " 2558545\n", + " Select Portfolio Servicing contacted my insura...\n", " \n", " \n", - " 1800818\n", - " Im writing in reference regarding XXXXXXXX XXX...\n", + " 2558652\n", + " I checked my credit report and I am upset on w...\n", " \n", " \n", "\n", @@ -361,11 +361,11 @@ ], "text/plain": [ " consumer_complaint_narrative\n", - "1799560 Thursday, XX/XX/XXXX, unauthorized charges wer...\n", - "1800272 The credit reporting company is reporting inac...\n", - "1800409 In accordance with the Fair Credit Reporting a...\n", - "1800550 I told the credit bureaus to \" investigate eve...\n", - "1800818 Im writing in reference regarding XXXXXXXX XXX..." + "2557016 I've been disputing fraud accounts on my credi...\n", + "2557686 American Express Platinum totally messed up my...\n", + "2558170 I recently looked at my credit report and noti...\n", + "2558545 Select Portfolio Servicing contacted my insura...\n", + "2558652 I checked my credit report and I am upset on w..." ] }, "execution_count": 7, @@ -418,7 +418,7 @@ { "data": { "text/html": [ - "Query job e3ff0549-f0ee-4508-bb4f-beea14bf54f5 is DONE. 0 Bytes processed. Open Job" + "Query job e4616b5e-b4c0-490c-a249-484f373f89d9 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -429,9 +429,9 @@ } ], "source": [ - "from bigframes.ml.llm import PaLM2TextEmbeddingGenerator\n", + "from bigframes.ml.llm import TextEmbeddingGenerator\n", "\n", - "model = PaLM2TextEmbeddingGenerator() # No connection id needed" + "model = TextEmbeddingGenerator() # No connection id needed" ] }, { @@ -444,7 +444,7 @@ { "data": { "text/html": [ - "Query job 5b3d8f8c-9e8d-4378-b4df-e3328300f17a is DONE. 1.3 GB processed. Open Job" + "Query job 89f96e88-2dd5-4326-8912-925b237e2877 is DONE. 1.3 GB processed. Open Job" ], "text/plain": [ "" @@ -454,21 +454,17 @@ "output_type": "display_data" }, { - "data": { - "text/html": [ - "Query job f35c2982-4953-45fa-84bd-d0ce04e13c5e is DONE. 80.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/__init__.py:108: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] }, { "data": { "text/html": [ - "Query job b70c55a3-b18b-4313-86b0-31f5b3b570fb is DONE. 20.0 kB processed. Open Job" + "Query job bcdbfe96-2cce-4269-81f4-0334033b458b is DONE. 20.0 kB processed. Open Job" ], "text/plain": [ "" @@ -480,7 +476,7 @@ { "data": { "text/html": [ - "Query job 2b2cfd9f-c713-4411-a3ca-1916cec84ff0 is DONE. 0 Bytes processed. Open Job" + "Query job 3b89850f-4491-4343-912a-7a2fd3137790 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -492,7 +488,7 @@ { "data": { "text/html": [ - "Query job 09cadae1-1c66-43cf-a76f-7495b0123006 is DONE. 71.9 MB processed. Open Job" + "Query job a2999e90-8d14-4f4a-99dc-4e769df01837 is DONE. 72.0 MB processed. Open Job" ], "text/plain": [ "" @@ -522,187 +518,188 @@ " \n", " \n", " \n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 782\n", - " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", - " {\"token_count\":121,\"truncated\":false}\n", + " 415\n", + " [ 2.56774724e-02 -1.06168222e-02 3.06945704e-...\n", + " {\"token_count\":171,\"truncated\":false}\n", " \n", - " I 've sent multiple letters to this agency abo...\n", + " DEPT OF EDUCATION/XXXX is stating I was late ...\n", " \n", " \n", - " 795\n", - " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", - " {\"token_count\":141,\"truncated\":false}\n", + " 596\n", + " [ 5.90653270e-02 -9.31344274e-03 -7.12460047e-...\n", + " {\"token_count\":668,\"truncated\":false}\n", " \n", - " I receive social security XXXX funds in my XXX...\n", + " I alerted my credit card company XX/XX/2017 th...\n", " \n", " \n", - " 861\n", - " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", - " {\"token_count\":160,\"truncated\":false}\n", + " 706\n", + " [ 0.01298233 0.00130001 0.01800315 0.037078...\n", + " {\"token_count\":252,\"truncated\":false}\n", " \n", - " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", + " Sallie mae is corrupt. \n", + "I have tried to talk t...\n", " \n", " \n", - " 1103\n", - " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", - " {\"token_count\":31,\"truncated\":false}\n", + " 804\n", + " [-1.39777679e-02 1.68943349e-02 5.53999236e-...\n", + " {\"token_count\":412,\"truncated\":false}\n", " \n", - " The debt occurred more than 7 years in the pas...\n", + " In accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 1241\n", - " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", - " {\"token_count\":23,\"truncated\":false}\n", + " 861\n", + " [ 2.33309343e-02 -2.36528926e-03 3.37129943e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1729\n", - " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", - " {\"token_count\":382,\"truncated\":false}\n", + " 1030\n", + " [ 0.06060313 -0.06495965 -0.03605044 -0.028016...\n", + " {\"token_count\":298,\"truncated\":false}\n", " \n", - " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", + " Hello, I would like to complain about PayPal H...\n", " \n", " \n", - " 2167\n", - " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", - " {\"token_count\":556,\"truncated\":false}\n", + " 1582\n", + " [ 0.01255985 -0.01652482 -0.02638046 0.036858...\n", + " {\"token_count\":814,\"truncated\":false}\n", " \n", - " This is the third such complaint I have submit...\n", + " Transunion is listing personal information ( n...\n", " \n", " \n", - " 2219\n", - " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", - " {\"token_count\":196,\"truncated\":false}\n", + " 1600\n", + " [ 5.13355099e-02 4.01246967e-03 5.72342947e-...\n", + " {\"token_count\":653,\"truncated\":false}\n", " \n", - " Found and add online for a Prepaid Credit card...\n", + " On XX/XX/XXXX, I called Citizen Bank at XXXX t...\n", " \n", " \n", - " 2392\n", - " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", - " {\"token_count\":641,\"truncated\":false}\n", + " 2060\n", + " [ 6.44792162e-04 4.95899878e-02 4.67925966e-...\n", + " {\"token_count\":136,\"truncated\":false}\n", " \n", - " I am furnishing this complaint against Fed Loa...\n", + " Theses names are the known liars that I have s...\n", " \n", " \n", - " 2528\n", - " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", - " {\"token_count\":176,\"truncated\":false}\n", + " 2283\n", + " [ 4.71848622e-02 -8.68239347e-03 5.80501892e-...\n", + " {\"token_count\":478,\"truncated\":false}\n", " \n", - " Despite multiple written requests, the unverif...\n", + " My house was hit by a tree XX/XX/2018. My insu...\n", " \n", " \n", - " 2737\n", - " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", - " {\"token_count\":230,\"truncated\":false}\n", + " 2421\n", + " [-2.90394691e-03 -1.81679502e-02 -7.99657404e-...\n", + " {\"token_count\":389,\"truncated\":false}\n", " \n", - " After unsatisfying communication in the messag...\n", + " I became aware of a credit inquiry on my XXXX...\n", " \n", " \n", - " 2859\n", - " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", - " {\"token_count\":238,\"truncated\":false}\n", + " 2422\n", + " [-6.70500053e-03 1.51133696e-02 4.94448021e-...\n", + " {\"token_count\":124,\"truncated\":false}\n", " \n", - " Good Morning. My name is XXXX XXXX. My account...\n", + " I have sent numerous letters, police reports a...\n", " \n", " \n", - " 3439\n", - " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", - " {\"token_count\":197,\"truncated\":false}\n", + " 2658\n", + " [ 6.70989677e-02 -3.53626162e-02 1.08648362e-...\n", + " {\"token_count\":762,\"truncated\":false}\n", " \n", - " I have ongoing disputes that are preventing me...\n", + " This letter concerns two disputes ( chargeback...\n", " \n", " \n", - " 3738\n", - " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", - " {\"token_count\":160,\"truncated\":false}\n", + " 2883\n", + " [-1.28255319e-02 -1.89735275e-02 5.68657108e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " I had a loan with national Collegiate Trust. 
i...\n", + " It is very frustrating that this has been goin...\n", " \n", " \n", - " 3805\n", - " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", - " {\"token_count\":477,\"truncated\":false}\n", + " 2951\n", + " [ 3.23301251e-03 -2.61142217e-02 1.31891826e-...\n", + " {\"token_count\":95,\"truncated\":false}\n", " \n", - " Hi I am submitting this XXXX XXXX this isn't a...\n", + " I, the consumer, in fact, have a right to priv...\n", " \n", " \n", - " 3915\n", - " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", - " {\"token_count\":116,\"truncated\":false}\n", + " 2992\n", + " [-2.22910382e-03 -1.07050659e-02 4.74211425e-...\n", + " {\"token_count\":407,\"truncated\":false}\n", " \n", - " portfolio is showin on my credit report with a...\n", + " XXXX XXXX XXXX should not be reporting to Expe...\n", " \n", " \n", - " 3917\n", - " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", - " {\"token_count\":71,\"truncated\":false}\n", + " 3969\n", + " [ 1.58297736e-02 3.01055871e-02 5.60088176e-...\n", + " {\"token_count\":287,\"truncated\":false}\n", " \n", - " the company shared my information with another...\n", + " DEAR CFPB ; XXXX ; XXXX ; AND TRANSUNION ; SEE...\n", " \n", " \n", - " 4281\n", - " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", - " {\"token_count\":130,\"truncated\":false}\n", + " 4087\n", + " [ 1.99207035e-03 -7.62321474e-03 7.92114343e-...\n", + " {\"token_count\":88,\"truncated\":false}\n", " \n", - " I tried to submit a teacher loan forgiveness a...\n", + " This debt was from my identity being stolen I ...\n", " \n", " \n", - " 4470\n", - " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", - " {\"token_count\":200,\"truncated\":false}\n", + " 4326\n", + " [ 3.44273262e-02 -3.36350128e-02 1.91939529e-...\n", + " {\"token_count\":52,\"truncated\":false}\n", " \n", - " in accordance with the Fair Credit Reporting a...\n", + " The items that are reflected on my credit repo...\n", " \n", " \n", - " 4915\n", - " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", - " {\"token_count\":23,\"truncated\":false}\n", + " 4682\n", + " [ 2.47727744e-02 -1.77769139e-02 4.63737026e-...\n", + " {\"token_count\":284,\"truncated\":false}\n", " \n", - " XXXX XXXX did not give me a receipt or a copy ...\n", + " I filed for chapter XXXX bankruptcy on XXXX...\n", " \n", " \n", - " 4928\n", - " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", - " {\"token_count\":83,\"truncated\":false}\n", + " 5005\n", + " [ 2.51834448e-02 -4.92606424e-02 -1.37688573e-...\n", + " {\"token_count\":17,\"truncated\":false}\n", " \n", - " This company has filed a civil suit during a g...\n", + " There are 2 Inquires on my credit report that ...\n", " \n", " \n", - " 5338\n", - " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", - " {\"token_count\":1279,\"truncated\":false}\n", + " 5144\n", + " [ 3.26358266e-02 -3.67171178e-03 3.65621522e-...\n", + " {\"token_count\":105,\"truncated\":false}\n", " \n", - " My credit report contains errors that is keepi...\n", + " My mortgage was sold from XXXX XXXX to freed...\n", " \n", " \n", - " 5582\n", - " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", - " {\"token_count\":396,\"truncated\":false}\n", + " 6090\n", + " [ 2.47520711e-02 1.09149124e-02 1.35175223e-...\n", + " {\"token_count\":545,\"truncated\":false}\n", " \n", - " Coast Professional, XXXX, LA contacted me by m...\n", + " On XX/XX/XXXX this company received certified...\n", " \n", " \n", - " 6386\n", - " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", - " 
{\"token_count\":79,\"truncated\":false}\n", + " 6449\n", + " [ 1.86854266e-02 1.31238240e-03 -4.96791191e-...\n", + " {\"token_count\":104,\"truncated\":false}\n", " \n", - " Cares act refund requested in XXXX, called mul...\n", + " After hours on the phone with multiple agents,...\n", " \n", " \n", - " 6956\n", - " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 6486\n", + " [ 1.56347770e-02 2.23377198e-02 -1.32683543e-...\n", + " {\"token_count\":211,\"truncated\":false}\n", " \n", - " n accordance with the Fair Credit Reporting ac...\n", + " On XX/XX/2019 two charges one for XXXX and one...\n", " \n", " \n", "\n", @@ -710,86 +707,87 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", - "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", - "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", - "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", - "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", - "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", - "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", - "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", - "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", - "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", - "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", - "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", - "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", - "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", - "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", - "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", - "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", - "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", - "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", - "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... \n", - "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", - "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", - "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", - "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", - "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", + " ml_generate_embedding_result \\\n", + "415 [ 2.56774724e-02 -1.06168222e-02 3.06945704e-... \n", + "596 [ 5.90653270e-02 -9.31344274e-03 -7.12460047e-... \n", + "706 [ 0.01298233 0.00130001 0.01800315 0.037078... \n", + "804 [-1.39777679e-02 1.68943349e-02 5.53999236e-... \n", + "861 [ 2.33309343e-02 -2.36528926e-03 3.37129943e-... \n", + "1030 [ 0.06060313 -0.06495965 -0.03605044 -0.028016... \n", + "1582 [ 0.01255985 -0.01652482 -0.02638046 0.036858... \n", + "1600 [ 5.13355099e-02 4.01246967e-03 5.72342947e-... \n", + "2060 [ 6.44792162e-04 4.95899878e-02 4.67925966e-... \n", + "2283 [ 4.71848622e-02 -8.68239347e-03 5.80501892e-... \n", + "2421 [-2.90394691e-03 -1.81679502e-02 -7.99657404e-... \n", + "2422 [-6.70500053e-03 1.51133696e-02 4.94448021e-... \n", + "2658 [ 6.70989677e-02 -3.53626162e-02 1.08648362e-... \n", + "2883 [-1.28255319e-02 -1.89735275e-02 5.68657108e-... \n", + "2951 [ 3.23301251e-03 -2.61142217e-02 1.31891826e-... \n", + "2992 [-2.22910382e-03 -1.07050659e-02 4.74211425e-... \n", + "3969 [ 1.58297736e-02 3.01055871e-02 5.60088176e-... \n", + "4087 [ 1.99207035e-03 -7.62321474e-03 7.92114343e-... 
\n", + "4326 [ 3.44273262e-02 -3.36350128e-02 1.91939529e-... \n", + "4682 [ 2.47727744e-02 -1.77769139e-02 4.63737026e-... \n", + "5005 [ 2.51834448e-02 -4.92606424e-02 -1.37688573e-... \n", + "5144 [ 3.26358266e-02 -3.67171178e-03 3.65621522e-... \n", + "6090 [ 2.47520711e-02 1.09149124e-02 1.35175223e-... \n", + "6449 [ 1.86854266e-02 1.31238240e-03 -4.96791191e-... \n", + "6486 [ 1.56347770e-02 2.23377198e-02 -1.32683543e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "782 {\"token_count\":121,\"truncated\":false} \n", - "795 {\"token_count\":141,\"truncated\":false} \n", - "861 {\"token_count\":160,\"truncated\":false} \n", - "1103 {\"token_count\":31,\"truncated\":false} \n", - "1241 {\"token_count\":23,\"truncated\":false} \n", - "1729 {\"token_count\":382,\"truncated\":false} \n", - "2167 {\"token_count\":556,\"truncated\":false} \n", - "2219 {\"token_count\":196,\"truncated\":false} \n", - "2392 {\"token_count\":641,\"truncated\":false} \n", - "2528 {\"token_count\":176,\"truncated\":false} \n", - "2737 {\"token_count\":230,\"truncated\":false} \n", - "2859 {\"token_count\":238,\"truncated\":false} \n", - "3439 {\"token_count\":197,\"truncated\":false} \n", - "3738 {\"token_count\":160,\"truncated\":false} \n", - "3805 {\"token_count\":477,\"truncated\":false} \n", - "3915 {\"token_count\":116,\"truncated\":false} \n", - "3917 {\"token_count\":71,\"truncated\":false} \n", - "4281 {\"token_count\":130,\"truncated\":false} \n", - "4470 {\"token_count\":200,\"truncated\":false} \n", - "4915 {\"token_count\":23,\"truncated\":false} \n", - "4928 {\"token_count\":83,\"truncated\":false} \n", - "5338 {\"token_count\":1279,\"truncated\":false} \n", - "5582 {\"token_count\":396,\"truncated\":false} \n", - "6386 {\"token_count\":79,\"truncated\":false} \n", - "6956 {\"token_count\":194,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "415 {\"token_count\":171,\"truncated\":false} \n", + "596 {\"token_count\":668,\"truncated\":false} \n", + "706 {\"token_count\":252,\"truncated\":false} \n", + "804 {\"token_count\":412,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1030 {\"token_count\":298,\"truncated\":false} \n", + "1582 {\"token_count\":814,\"truncated\":false} \n", + "1600 {\"token_count\":653,\"truncated\":false} \n", + "2060 {\"token_count\":136,\"truncated\":false} \n", + "2283 {\"token_count\":478,\"truncated\":false} \n", + "2421 {\"token_count\":389,\"truncated\":false} \n", + "2422 {\"token_count\":124,\"truncated\":false} \n", + "2658 {\"token_count\":762,\"truncated\":false} \n", + "2883 {\"token_count\":71,\"truncated\":false} \n", + "2951 {\"token_count\":95,\"truncated\":false} \n", + "2992 {\"token_count\":407,\"truncated\":false} \n", + "3969 {\"token_count\":287,\"truncated\":false} \n", + "4087 {\"token_count\":88,\"truncated\":false} \n", + "4326 {\"token_count\":52,\"truncated\":false} \n", + "4682 {\"token_count\":284,\"truncated\":false} \n", + "5005 {\"token_count\":17,\"truncated\":false} \n", + "5144 {\"token_count\":105,\"truncated\":false} \n", + "6090 {\"token_count\":545,\"truncated\":false} \n", + "6449 {\"token_count\":104,\"truncated\":false} \n", + "6486 {\"token_count\":211,\"truncated\":false} \n", "\n", " content \n", - "782 I 've sent multiple letters to this agency abo... \n", - "795 I receive social security XXXX funds in my XXX... \n", + "415 DEPT OF EDUCATION/XXXX is stating I was late ... 
\n", + "596 I alerted my credit card company XX/XX/2017 th... \n", + "706 Sallie mae is corrupt. \n", + "I have tried to talk t... \n", + "804 In accordance with the Fair Credit Reporting a... \n", "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", - "1103 The debt occurred more than 7 years in the pas... \n", - "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", - "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", - "2167 This is the third such complaint I have submit... \n", - "2219 Found and add online for a Prepaid Credit card... \n", - "2392 I am furnishing this complaint against Fed Loa... \n", - "2528 Despite multiple written requests, the unverif... \n", - "2737 After unsatisfying communication in the messag... \n", - "2859 Good Morning. My name is XXXX XXXX. My account... \n", - "3439 I have ongoing disputes that are preventing me... \n", - "3738 I had a loan with national Collegiate Trust. i... \n", - "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", - "3915 portfolio is showin on my credit report with a... \n", - "3917 the company shared my information with another... \n", - "4281 I tried to submit a teacher loan forgiveness a... \n", - "4470 in accordance with the Fair Credit Reporting a... \n", - "4915 XXXX XXXX did not give me a receipt or a copy ... \n", - "4928 This company has filed a civil suit during a g... \n", - "5338 My credit report contains errors that is keepi... \n", - "5582 Coast Professional, XXXX, LA contacted me by m... \n", - "6386 Cares act refund requested in XXXX, called mul... \n", - "6956 n accordance with the Fair Credit Reporting ac... \n", + "1030 Hello, I would like to complain about PayPal H... \n", + "1582 Transunion is listing personal information ( n... \n", + "1600 On XX/XX/XXXX, I called Citizen Bank at XXXX t... \n", + "2060 Theses names are the known liars that I have s... \n", + "2283 My house was hit by a tree XX/XX/2018. My insu... \n", + "2421 I became aware of a credit inquiry on my XXXX... \n", + "2422 I have sent numerous letters, police reports a... \n", + "2658 This letter concerns two disputes ( chargeback... \n", + "2883 It is very frustrating that this has been goin... \n", + "2951 I, the consumer, in fact, have a right to priv... \n", + "2992 XXXX XXXX XXXX should not be reporting to Expe... \n", + "3969 DEAR CFPB ; XXXX ; XXXX ; AND TRANSUNION ; SEE... \n", + "4087 This debt was from my identity being stolen I ... \n", + "4326 The items that are reflected on my credit repo... \n", + "4682 I filed for chapter XXXX bankruptcy on XXXX... \n", + "5005 There are 2 Inquires on my credit report that ... \n", + "5144 My mortgage was sold from XXXX XXXX to freed... \n", + "6090 On XX/XX/XXXX this company received certified... \n", + "6449 After hours on the phone with multiple agents,... \n", + "6486 On XX/XX/2019 two charges one for XXXX and one... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -816,13 +814,13 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 2c99b34a-1956-4de7-8330-898f1f25560b is DONE. 71.9 MB processed. Open Job" + "Query job 16915c47-ab13-4d06-94aa-9ebdb65d91fe is DONE. 72.0 MB processed. Open Job" ], "text/plain": [ "" @@ -834,7 +832,7 @@ { "data": { "text/html": [ - "Query job 3ffed5f8-935a-4a3f-a560-6416445e4868 is DONE. 0 Bytes processed. Open Job" + "Query job 4ab4fbf0-6fd3-4936-9915-cfd7ccd106d1 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -846,7 +844,7 @@ { "data": { "text/html": [ - "Query job 7b55783a-6d8f-41b9-b404-73253140029a is DONE. 72.3 MB processed. Open Job" + "Query job b11d3794-6bb8-4c47-a91b-dcc472cf4d69 is DONE. 72.4 MB processed. Open Job" ], "text/plain": [ "" @@ -876,187 +874,188 @@ " \n", " \n", " \n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 782\n", - " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", - " {\"token_count\":121,\"truncated\":false}\n", + " 415\n", + " [ 2.56774724e-02 -1.06168222e-02 3.06945704e-...\n", + " {\"token_count\":171,\"truncated\":false}\n", " \n", - " I 've sent multiple letters to this agency abo...\n", + " DEPT OF EDUCATION/XXXX is stating I was late ...\n", " \n", " \n", - " 795\n", - " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", - " {\"token_count\":141,\"truncated\":false}\n", + " 596\n", + " [ 5.90653270e-02 -9.31344274e-03 -7.12460047e-...\n", + " {\"token_count\":668,\"truncated\":false}\n", " \n", - " I receive social security XXXX funds in my XXX...\n", + " I alerted my credit card company XX/XX/2017 th...\n", " \n", " \n", - " 861\n", - " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", - " {\"token_count\":160,\"truncated\":false}\n", + " 706\n", + " [ 0.01298233 0.00130001 0.01800315 0.037078...\n", + " {\"token_count\":252,\"truncated\":false}\n", " \n", - " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", + " Sallie mae is corrupt. \n", + "I have tried to talk t...\n", " \n", " \n", - " 1103\n", - " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", - " {\"token_count\":31,\"truncated\":false}\n", + " 804\n", + " [-1.39777679e-02 1.68943349e-02 5.53999236e-...\n", + " {\"token_count\":412,\"truncated\":false}\n", " \n", - " The debt occurred more than 7 years in the pas...\n", + " In accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 1241\n", - " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", - " {\"token_count\":23,\"truncated\":false}\n", + " 861\n", + " [ 2.33309343e-02 -2.36528926e-03 3.37129943e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1729\n", - " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", - " {\"token_count\":382,\"truncated\":false}\n", + " 1030\n", + " [ 0.06060313 -0.06495965 -0.03605044 -0.028016...\n", + " {\"token_count\":298,\"truncated\":false}\n", " \n", - " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", + " Hello, I would like to complain about PayPal H...\n", " \n", " \n", - " 2167\n", - " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", - " {\"token_count\":556,\"truncated\":false}\n", + " 1582\n", + " [ 0.01255985 -0.01652482 -0.02638046 0.036858...\n", + " {\"token_count\":814,\"truncated\":false}\n", " \n", - " This is the third such complaint I have submit...\n", + " Transunion is listing personal information ( n...\n", " \n", " \n", - " 2219\n", - " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", - " {\"token_count\":196,\"truncated\":false}\n", + " 1600\n", + " [ 5.13355099e-02 4.01246967e-03 5.72342947e-...\n", + " {\"token_count\":653,\"truncated\":false}\n", " \n", - " Found and add online for a Prepaid Credit card...\n", + " On XX/XX/XXXX, I called Citizen Bank at XXXX t...\n", " \n", " \n", - " 2392\n", - " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", - " {\"token_count\":641,\"truncated\":false}\n", + " 2060\n", + " [ 6.44792162e-04 4.95899878e-02 4.67925966e-...\n", + " {\"token_count\":136,\"truncated\":false}\n", " \n", - " I am furnishing this complaint against Fed Loa...\n", + " Theses names are the known liars that I have s...\n", " \n", " \n", - " 2528\n", - " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", - " {\"token_count\":176,\"truncated\":false}\n", + " 2283\n", + " [ 4.71848622e-02 -8.68239347e-03 5.80501892e-...\n", + " {\"token_count\":478,\"truncated\":false}\n", " \n", - " Despite multiple written requests, the unverif...\n", + " My house was hit by a tree XX/XX/2018. My insu...\n", " \n", " \n", - " 2737\n", - " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", - " {\"token_count\":230,\"truncated\":false}\n", + " 2421\n", + " [-2.90394691e-03 -1.81679502e-02 -7.99657404e-...\n", + " {\"token_count\":389,\"truncated\":false}\n", " \n", - " After unsatisfying communication in the messag...\n", + " I became aware of a credit inquiry on my XXXX...\n", " \n", " \n", - " 2859\n", - " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", - " {\"token_count\":238,\"truncated\":false}\n", + " 2422\n", + " [-6.70500053e-03 1.51133696e-02 4.94448021e-...\n", + " {\"token_count\":124,\"truncated\":false}\n", " \n", - " Good Morning. My name is XXXX XXXX. My account...\n", + " I have sent numerous letters, police reports a...\n", " \n", " \n", - " 3439\n", - " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", - " {\"token_count\":197,\"truncated\":false}\n", + " 2658\n", + " [ 6.70989677e-02 -3.53626162e-02 1.08648362e-...\n", + " {\"token_count\":762,\"truncated\":false}\n", " \n", - " I have ongoing disputes that are preventing me...\n", + " This letter concerns two disputes ( chargeback...\n", " \n", " \n", - " 3738\n", - " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", - " {\"token_count\":160,\"truncated\":false}\n", + " 2883\n", + " [-1.28255319e-02 -1.89735275e-02 5.68657108e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " I had a loan with national Collegiate Trust. 
i...\n", + " It is very frustrating that this has been goin...\n", " \n", " \n", - " 3805\n", - " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", - " {\"token_count\":477,\"truncated\":false}\n", + " 2951\n", + " [ 3.23301251e-03 -2.61142217e-02 1.31891826e-...\n", + " {\"token_count\":95,\"truncated\":false}\n", " \n", - " Hi I am submitting this XXXX XXXX this isn't a...\n", + " I, the consumer, in fact, have a right to priv...\n", " \n", " \n", - " 3915\n", - " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", - " {\"token_count\":116,\"truncated\":false}\n", + " 2992\n", + " [-2.22910382e-03 -1.07050659e-02 4.74211425e-...\n", + " {\"token_count\":407,\"truncated\":false}\n", " \n", - " portfolio is showin on my credit report with a...\n", + " XXXX XXXX XXXX should not be reporting to Expe...\n", " \n", " \n", - " 3917\n", - " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", - " {\"token_count\":71,\"truncated\":false}\n", + " 3969\n", + " [ 1.58297736e-02 3.01055871e-02 5.60088176e-...\n", + " {\"token_count\":287,\"truncated\":false}\n", " \n", - " the company shared my information with another...\n", + " DEAR CFPB ; XXXX ; XXXX ; AND TRANSUNION ; SEE...\n", " \n", " \n", - " 4281\n", - " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", - " {\"token_count\":130,\"truncated\":false}\n", + " 4087\n", + " [ 1.99207035e-03 -7.62321474e-03 7.92114343e-...\n", + " {\"token_count\":88,\"truncated\":false}\n", " \n", - " I tried to submit a teacher loan forgiveness a...\n", + " This debt was from my identity being stolen I ...\n", " \n", " \n", - " 4470\n", - " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", - " {\"token_count\":200,\"truncated\":false}\n", + " 4326\n", + " [ 3.44273262e-02 -3.36350128e-02 1.91939529e-...\n", + " {\"token_count\":52,\"truncated\":false}\n", " \n", - " in accordance with the Fair Credit Reporting a...\n", + " The items that are reflected on my credit repo...\n", " \n", " \n", - " 4915\n", - " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", - " {\"token_count\":23,\"truncated\":false}\n", + " 4682\n", + " [ 2.47727744e-02 -1.77769139e-02 4.63737026e-...\n", + " {\"token_count\":284,\"truncated\":false}\n", " \n", - " XXXX XXXX did not give me a receipt or a copy ...\n", + " I filed for chapter XXXX bankruptcy on XXXX...\n", " \n", " \n", - " 4928\n", - " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", - " {\"token_count\":83,\"truncated\":false}\n", + " 5005\n", + " [ 2.51834448e-02 -4.92606424e-02 -1.37688573e-...\n", + " {\"token_count\":17,\"truncated\":false}\n", " \n", - " This company has filed a civil suit during a g...\n", + " There are 2 Inquires on my credit report that ...\n", " \n", " \n", - " 5338\n", - " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", - " {\"token_count\":1279,\"truncated\":false}\n", + " 5144\n", + " [ 3.26358266e-02 -3.67171178e-03 3.65621522e-...\n", + " {\"token_count\":105,\"truncated\":false}\n", " \n", - " My credit report contains errors that is keepi...\n", + " My mortgage was sold from XXXX XXXX to freed...\n", " \n", " \n", - " 5582\n", - " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", - " {\"token_count\":396,\"truncated\":false}\n", + " 6090\n", + " [ 2.47520711e-02 1.09149124e-02 1.35175223e-...\n", + " {\"token_count\":545,\"truncated\":false}\n", " \n", - " Coast Professional, XXXX, LA contacted me by m...\n", + " On XX/XX/XXXX this company received certified...\n", " \n", " \n", - " 6386\n", - " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", - " 
{\"token_count\":79,\"truncated\":false}\n", + " 6449\n", + " [ 1.86854266e-02 1.31238240e-03 -4.96791191e-...\n", + " {\"token_count\":104,\"truncated\":false}\n", " \n", - " Cares act refund requested in XXXX, called mul...\n", + " After hours on the phone with multiple agents,...\n", " \n", " \n", - " 6956\n", - " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 6486\n", + " [ 1.56347770e-02 2.23377198e-02 -1.32683543e-...\n", + " {\"token_count\":211,\"truncated\":false}\n", " \n", - " n accordance with the Fair Credit Reporting ac...\n", + " On XX/XX/2019 two charges one for XXXX and one...\n", " \n", " \n", "\n", @@ -1064,102 +1063,103 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", - "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", - "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", - "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", - "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", - "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", - "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", - "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", - "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", - "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", - "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", - "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", - "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", - "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", - "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", - "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", - "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", - "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", - "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", - "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... \n", - "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", - "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", - "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", - "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", - "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", + " ml_generate_embedding_result \\\n", + "415 [ 2.56774724e-02 -1.06168222e-02 3.06945704e-... \n", + "596 [ 5.90653270e-02 -9.31344274e-03 -7.12460047e-... \n", + "706 [ 0.01298233 0.00130001 0.01800315 0.037078... \n", + "804 [-1.39777679e-02 1.68943349e-02 5.53999236e-... \n", + "861 [ 2.33309343e-02 -2.36528926e-03 3.37129943e-... \n", + "1030 [ 0.06060313 -0.06495965 -0.03605044 -0.028016... \n", + "1582 [ 0.01255985 -0.01652482 -0.02638046 0.036858... \n", + "1600 [ 5.13355099e-02 4.01246967e-03 5.72342947e-... \n", + "2060 [ 6.44792162e-04 4.95899878e-02 4.67925966e-... \n", + "2283 [ 4.71848622e-02 -8.68239347e-03 5.80501892e-... \n", + "2421 [-2.90394691e-03 -1.81679502e-02 -7.99657404e-... \n", + "2422 [-6.70500053e-03 1.51133696e-02 4.94448021e-... \n", + "2658 [ 6.70989677e-02 -3.53626162e-02 1.08648362e-... \n", + "2883 [-1.28255319e-02 -1.89735275e-02 5.68657108e-... \n", + "2951 [ 3.23301251e-03 -2.61142217e-02 1.31891826e-... \n", + "2992 [-2.22910382e-03 -1.07050659e-02 4.74211425e-... \n", + "3969 [ 1.58297736e-02 3.01055871e-02 5.60088176e-... \n", + "4087 [ 1.99207035e-03 -7.62321474e-03 7.92114343e-... 
\n", + "4326 [ 3.44273262e-02 -3.36350128e-02 1.91939529e-... \n", + "4682 [ 2.47727744e-02 -1.77769139e-02 4.63737026e-... \n", + "5005 [ 2.51834448e-02 -4.92606424e-02 -1.37688573e-... \n", + "5144 [ 3.26358266e-02 -3.67171178e-03 3.65621522e-... \n", + "6090 [ 2.47520711e-02 1.09149124e-02 1.35175223e-... \n", + "6449 [ 1.86854266e-02 1.31238240e-03 -4.96791191e-... \n", + "6486 [ 1.56347770e-02 2.23377198e-02 -1.32683543e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "782 {\"token_count\":121,\"truncated\":false} \n", - "795 {\"token_count\":141,\"truncated\":false} \n", - "861 {\"token_count\":160,\"truncated\":false} \n", - "1103 {\"token_count\":31,\"truncated\":false} \n", - "1241 {\"token_count\":23,\"truncated\":false} \n", - "1729 {\"token_count\":382,\"truncated\":false} \n", - "2167 {\"token_count\":556,\"truncated\":false} \n", - "2219 {\"token_count\":196,\"truncated\":false} \n", - "2392 {\"token_count\":641,\"truncated\":false} \n", - "2528 {\"token_count\":176,\"truncated\":false} \n", - "2737 {\"token_count\":230,\"truncated\":false} \n", - "2859 {\"token_count\":238,\"truncated\":false} \n", - "3439 {\"token_count\":197,\"truncated\":false} \n", - "3738 {\"token_count\":160,\"truncated\":false} \n", - "3805 {\"token_count\":477,\"truncated\":false} \n", - "3915 {\"token_count\":116,\"truncated\":false} \n", - "3917 {\"token_count\":71,\"truncated\":false} \n", - "4281 {\"token_count\":130,\"truncated\":false} \n", - "4470 {\"token_count\":200,\"truncated\":false} \n", - "4915 {\"token_count\":23,\"truncated\":false} \n", - "4928 {\"token_count\":83,\"truncated\":false} \n", - "5338 {\"token_count\":1279,\"truncated\":false} \n", - "5582 {\"token_count\":396,\"truncated\":false} \n", - "6386 {\"token_count\":79,\"truncated\":false} \n", - "6956 {\"token_count\":194,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "415 {\"token_count\":171,\"truncated\":false} \n", + "596 {\"token_count\":668,\"truncated\":false} \n", + "706 {\"token_count\":252,\"truncated\":false} \n", + "804 {\"token_count\":412,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1030 {\"token_count\":298,\"truncated\":false} \n", + "1582 {\"token_count\":814,\"truncated\":false} \n", + "1600 {\"token_count\":653,\"truncated\":false} \n", + "2060 {\"token_count\":136,\"truncated\":false} \n", + "2283 {\"token_count\":478,\"truncated\":false} \n", + "2421 {\"token_count\":389,\"truncated\":false} \n", + "2422 {\"token_count\":124,\"truncated\":false} \n", + "2658 {\"token_count\":762,\"truncated\":false} \n", + "2883 {\"token_count\":71,\"truncated\":false} \n", + "2951 {\"token_count\":95,\"truncated\":false} \n", + "2992 {\"token_count\":407,\"truncated\":false} \n", + "3969 {\"token_count\":287,\"truncated\":false} \n", + "4087 {\"token_count\":88,\"truncated\":false} \n", + "4326 {\"token_count\":52,\"truncated\":false} \n", + "4682 {\"token_count\":284,\"truncated\":false} \n", + "5005 {\"token_count\":17,\"truncated\":false} \n", + "5144 {\"token_count\":105,\"truncated\":false} \n", + "6090 {\"token_count\":545,\"truncated\":false} \n", + "6449 {\"token_count\":104,\"truncated\":false} \n", + "6486 {\"token_count\":211,\"truncated\":false} \n", "\n", " content \n", - "782 I 've sent multiple letters to this agency abo... \n", - "795 I receive social security XXXX funds in my XXX... \n", + "415 DEPT OF EDUCATION/XXXX is stating I was late ... 
\n", + "596 I alerted my credit card company XX/XX/2017 th... \n", + "706 Sallie mae is corrupt. \n", + "I have tried to talk t... \n", + "804 In accordance with the Fair Credit Reporting a... \n", "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", - "1103 The debt occurred more than 7 years in the pas... \n", - "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", - "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", - "2167 This is the third such complaint I have submit... \n", - "2219 Found and add online for a Prepaid Credit card... \n", - "2392 I am furnishing this complaint against Fed Loa... \n", - "2528 Despite multiple written requests, the unverif... \n", - "2737 After unsatisfying communication in the messag... \n", - "2859 Good Morning. My name is XXXX XXXX. My account... \n", - "3439 I have ongoing disputes that are preventing me... \n", - "3738 I had a loan with national Collegiate Trust. i... \n", - "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", - "3915 portfolio is showin on my credit report with a... \n", - "3917 the company shared my information with another... \n", - "4281 I tried to submit a teacher loan forgiveness a... \n", - "4470 in accordance with the Fair Credit Reporting a... \n", - "4915 XXXX XXXX did not give me a receipt or a copy ... \n", - "4928 This company has filed a civil suit during a g... \n", - "5338 My credit report contains errors that is keepi... \n", - "5582 Coast Professional, XXXX, LA contacted me by m... \n", - "6386 Cares act refund requested in XXXX, called mul... \n", - "6956 n accordance with the Fair Credit Reporting ac... \n", + "1030 Hello, I would like to complain about PayPal H... \n", + "1582 Transunion is listing personal information ( n... \n", + "1600 On XX/XX/XXXX, I called Citizen Bank at XXXX t... \n", + "2060 Theses names are the known liars that I have s... \n", + "2283 My house was hit by a tree XX/XX/2018. My insu... \n", + "2421 I became aware of a credit inquiry on my XXXX... \n", + "2422 I have sent numerous letters, police reports a... \n", + "2658 This letter concerns two disputes ( chargeback... \n", + "2883 It is very frustrating that this has been goin... \n", + "2951 I, the consumer, in fact, have a right to priv... \n", + "2992 XXXX XXXX XXXX should not be reporting to Expe... \n", + "3969 DEAR CFPB ; XXXX ; XXXX ; AND TRANSUNION ; SEE... \n", + "4087 This debt was from my identity being stolen I ... \n", + "4326 The items that are reflected on my credit repo... \n", + "4682 I filed for chapter XXXX bankruptcy on XXXX... \n", + "5005 There are 2 Inquires on my credit report that ... \n", + "5144 My mortgage was sold from XXXX XXXX to freed... \n", + "6090 On XX/XX/XXXX this company received certified... \n", + "6449 After hours on the phone with multiple agents,... \n", + "6486 On XX/XX/2019 two charges one for XXXX and one... 
\n", "...\n", "\n", "[10000 rows x 4 columns]" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "successful_rows = (\n", - " (predicted_embeddings[\"ml_embed_text_status\"] == \"\")\n", + " (predicted_embeddings[\"ml_generate_embedding_status\"] == \"\")\n", " # Series.str.len() gives the length of an array.\n", " # See: https://stackoverflow.com/a/41340543/101923\n", - " & (predicted_embeddings[\"text_embedding\"].str.len() != 0)\n", + " & (predicted_embeddings[\"ml_generate_embedding_result\"].str.len() != 0)\n", ")\n", "predicted_embeddings = predicted_embeddings[successful_rows]\n", "predicted_embeddings\n" @@ -1185,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "id": "AhNTnEC5FRz2" }, @@ -1206,7 +1206,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "id": "6poSxh-fGJF7" }, @@ -1214,19 +1214,7 @@ { "data": { "text/html": [ - "Query job 46da96c8-c454-44d3-8b98-0e1bfeca69dd is DONE. 61.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job dc6fe7cf-329d-4274-aff9-0b8dc2e56230 is DONE. 0 Bytes processed. Open Job" + "Query job 3e01544b-9bc2-4298-8f7d-1e9f186ac72f is DONE. 61.6 MB processed. Open Job" ], "text/plain": [ "" @@ -1238,7 +1226,7 @@ { "data": { "text/html": [ - "Query job 8c25a14a-af39-40a9-add5-de0f14bce9ce is DONE. 72.4 MB processed. Open Job" + "Query job 8aca135c-65c3-4804-9c25-0d47fad0beb5 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1250,7 +1238,7 @@ { "data": { "text/html": [ - "Query job 0a6a45b2-7c35-4be8-91a3-391a5381553e is DONE. 80.0 kB processed. Open Job" + "Query job 0b15374d-d34b-4f2e-8a48-b77d7e7757ab is DONE. 72.7 MB processed. Open Job" ], "text/plain": [ "" @@ -1262,7 +1250,7 @@ { "data": { "text/html": [ - "Query job b5e00edd-de21-40c1-bf61-9f1affdea318 is DONE. 73.1 MB processed. Open Job" + "Query job fed90511-76f8-4aec-a988-e1a4dab711b0 is DONE. 73.2 MB processed. 
Open Job" ], "text/plain": [ "" @@ -1294,57 +1282,57 @@ " \n", " CENTROID_ID\n", " NEAREST_CENTROIDS_DISTANCE\n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 1094645\n", + " 3172121\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572...\n", - " [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.756634267893...\n", + " [ 3.18095312e-02 -3.54472063e-02 -7.13569671e-...\n", " {\"token_count\":10,\"truncated\":false}\n", " \n", - " I do not have an account with this creditor\n", + " Company did not provide verification and detai...\n", " \n", " \n", - " 3372485\n", + " 2137420\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310...\n", - " [-0.00161087 -0.04956109 -0.07371692 -0.057822...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.606628249825...\n", + " [ 1.91578846e-02 5.55988774e-02 8.88887007e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " Hard inquiries in my report that I do not reco...\n", + " I have already filed a dispute with Consumer A...\n", " \n", " \n", - " 2669308\n", + " 2350775\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244...\n", - " [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.606676295233...\n", + " [ 2.25369893e-02 2.29400061e-02 -6.42273854e-...\n", " {\"token_count\":100,\"truncated\":false}\n", " \n", - " I purchase {$25.00} for stock on the cash app ...\n", + " I informed Central Financial Control & provide...\n", " \n", " \n", - " 133816\n", + " 2904146\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124...\n", - " [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.596729348974...\n", + " [ 9.35115516e-02 4.27814946e-03 4.62085977e-...\n", " {\"token_count\":100,\"truncated\":false}\n", " \n", - " BBVA fees I am in The Texas snow storm where I...\n", + " I received a letter from a collections agency ...\n", " \n", " \n", - " 2697156\n", + " 1075571\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102...\n", - " [-1.28429877e-02 -1.85956229e-02 -3.93197313e-...\n", - " {\"token_count\":1011,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.453806107968...\n", + " [-1.93953840e-03 -5.80236455e-03 8.49655271e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " After paying on my student loan for years, I o...\n", + " I have not done business with this company, i ...\n", " \n", " \n", "\n", @@ -1352,42 +1340,42 @@ ], "text/plain": [ " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "1094645 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572... \n", - "3372485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310... \n", - "2669308 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244... \n", - "133816 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124... \n", - "2697156 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102... \n", + "3172121 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.756634267893... \n", + "2137420 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.606628249825... \n", + "2350775 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.606676295233... \n", + "2904146 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.596729348974... \n", + "1075571 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.453806107968... 
\n", "\n", - " text_embedding \\\n", - "1094645 [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-... \n", - "3372485 [-0.00161087 -0.04956109 -0.07371692 -0.057822... \n", - "2669308 [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-... \n", - "133816 [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-... \n", - "2697156 [-1.28429877e-02 -1.85956229e-02 -3.93197313e-... \n", + " ml_generate_embedding_result \\\n", + "3172121 [ 3.18095312e-02 -3.54472063e-02 -7.13569671e-... \n", + "2137420 [ 1.91578846e-02 5.55988774e-02 8.88887007e-... \n", + "2350775 [ 2.25369893e-02 2.29400061e-02 -6.42273854e-... \n", + "2904146 [ 9.35115516e-02 4.27814946e-03 4.62085977e-... \n", + "1075571 [-1.93953840e-03 -5.80236455e-03 8.49655271e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "1094645 {\"token_count\":10,\"truncated\":false} \n", - "3372485 {\"token_count\":10,\"truncated\":false} \n", - "2669308 {\"token_count\":100,\"truncated\":false} \n", - "133816 {\"token_count\":100,\"truncated\":false} \n", - "2697156 {\"token_count\":1011,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "3172121 {\"token_count\":10,\"truncated\":false} \n", + "2137420 {\"token_count\":100,\"truncated\":false} \n", + "2350775 {\"token_count\":100,\"truncated\":false} \n", + "2904146 {\"token_count\":100,\"truncated\":false} \n", + "1075571 {\"token_count\":100,\"truncated\":false} \n", "\n", " content \n", - "1094645 I do not have an account with this creditor \n", - "3372485 Hard inquiries in my report that I do not reco... \n", - "2669308 I purchase {$25.00} for stock on the cash app ... \n", - "133816 BBVA fees I am in The Texas snow storm where I... \n", - "2697156 After paying on my student loan for years, I o... " + "3172121 Company did not provide verification and detai... \n", + "2137420 I have already filed a dispute with Consumer A... \n", + "2350775 I informed Central Financial Control & provide... \n", + "2904146 I received a letter from a collections agency ... \n", + "1075571 I have not done business with this company, i ... " ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", + "cluster_model.fit(predicted_embeddings[[\"ml_generate_embedding_result\"]])\n", "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", @@ -1422,7 +1410,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": { "id": "2E7wXM_jGqo6" }, @@ -1430,7 +1418,7 @@ { "data": { "text/html": [ - "Query job 8d4f24d6-dc37-47d3-8b4d-4505a55c4ccc is DONE. 10.4 MB processed. Open Job" + "Query job d6c61334-255f-43fe-9a8f-9fbf6cdcb2be is DONE. 10.5 MB processed. Open Job" ], "text/plain": [ "" @@ -1442,7 +1430,7 @@ { "data": { "text/html": [ - "Query job c1f979ee-1f5d-4f37-8595-ee2167c06e63 is DONE. 10.4 MB processed. Open Job" + "Query job 03a12383-6752-45ca-9b01-36eecc74fb8a is DONE. 10.5 MB processed. Open Job" ], "text/plain": [ "" @@ -1468,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": { "id": "ZNDiueI9IP5e" }, @@ -1478,38 +1466,45 @@ "output_type": "stream", "text": [ "comment list 1:\n", - "1. 
This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", + "1. This debt was from my identity being stolen I didnt open any account that resulted in this collection i have completed a police report which can be verified with the XXXX police @ XXXX report # XXXX and i have a notarized identity theft affidavit from ftc please remove this off of my credit and close my file ASAP\n", + "2. On XX/XX/XXXX this company received certified mail asking for validation of debt. On XX/XX/XXXX the company still did not validate debt owed and they did not mark the debt disputed by XX/XX/XXXX through the major credit reporting bureaus. This is a violation of the FDCPA and FCRA. I did send a second letter which the company received on XX/XX/XXXX . A lady from the company called and talked to me about the debt on XX/XX/XXXX but again did not have the credit bureaus mark the item as disputed. The company still violated the laws. Section [ 15 U.S.C. 1681s-2 ] ( 3 ) duty to provide notice of dispute. If the completeness or accuracy of any information furnished by any person to any consumer reporting agency is disputed to such person by a consumer, the person may not furnish the information to any consumer reporting agency without notice that such information is disputed. ( B ) ti me of notice! The notice required under sub paragraph ( A ) shall be provided to the customer prior to, or no later than 30 days after, furnishing the negative information to a consumer reporting agency described in section 603 ( p ). This company violated the state laws. I received no information until XX/XX/XXXX . Therefore by law the company should have the item removed from the credit agencies such as transunion and XXXX . I tried to call the company back about the laws that was broken and left my name no return call. The copy of my credit reports are below and as you can see the items was n't marked disputed. XXXX is marked disputed because on XX/XX/XXXX I myself disputed the information with the credit bureau. The lady stated they did n't receive my dispute letter until XX/XX/XXXX . Included is certified mail reciepts with date, time stamp, and signature of the person who signed for the certified mail on XX/XX/XXXX and XX/XX/XXXX . So again the company violated the laws and I have all the proof. If I have a contract with this company please send to me by mail a contract bearing my signature of the contract.\n", + "3. On XX/XX/2022, Pioneer Credit Recovery of XXXX, NY identified an alleged debt, which I do not owe. \n", + "\n", + "On XX/XX/2022, I wrote a dispute letter to Pioneer, requesting that they stop communication with me, record my dispute, and provide verification of the debt if they believe otherwise. \n", "\n", - "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. \n", - "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. 
\n", - "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. \n", - "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", - "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", - "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", - "I was told it would take 5-7 business days to be resolved.\n", - "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", - "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", - "4. My government feeds are not coming on to my card and I need the problem fix today\n", - "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. I am not receiving statements, which proves it difficult to determine the due dates on the accounts. The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", + "Pioneer has not responded with verification, but has attempted to collect the debt since then by phone ( XX/XX/2022 ) and mail ( XX/XX/2022 ).\n", + "4. Disputed with the company on several occasions and they still havent provided proof in a timely manner. The FCRA gives the company 30 days to respond. I have not gotten a response.\n", + "5. I am not aware of this XXXX XXXX XXXX XXXX XXXX , XXXX balance. I have never seen anything dealing with this lender. Also, I have been threated that in 30 days they will seek to make a judgement on debt that does not belong to me. I understand that they are looking to offer me a settlement. However, I do not believe the validity of such debt accusation. Furthermore, I will not be limited to the action of court threats when I did not receive any notice of debt based on communication. The amount is {$880.00} from MBNA which was acquired by Bank of America in 2006. I do not claim debt.\n", "\n", "comment list 2:\n", - "1. 
XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", - "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", - "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", - "Why the online amortization schedule is not visible now? \n", + "1. My name is XXXX XXXX XXXX. This issue with a Loan Till Payday account was previously reported to you for collection practices, etc. I had a pay day loan in 2013. At the time, I banked with XXXX XXXX, who advised me that pay day loans are not good, and in the end XXXX closed my bank account, it was involuntary. In the interim, I made payments to the agency. XXXX and XXXX were the primary contacts. On the last payment, due to the fact that I told him I was coming in to pay cash, and they withdrew the funds, electronically, my account was affected. XXXX advised me that the payment made was the last payment and the other ( which was primarily interest remaining ) would be charged off. XXXX later called me and advised that XXXX was not authorized to make that decision and demanded the payment. I do n't understand how one person can cancel the arrangements made by someone else. \n", + "\n", + "In the end, they sold my account. It was reported to you, and that creditor then stated no further collection activity would occur. \n", + "\n", + "Last week I began receiving calls from a collection agency, XXXX XXXX stating I would called for a civil deposition on this account. I do n't even know this agency. Later, I then received another call stating that I needed to hold, and after several clicks was connected to someone at a Mediaction service. I denied the owing the loan and stated it was paid. \n", + "\n", + "Today, I received a call from an outsource service courier about a missed appointment or hearing??? What?? I have no idea who these people are. I called Loan Till Payday and was advised the loan was sold and I needed to settle with the new company. So, does this mean they are continuing to attempt to collect {$200.00}. \n", "\n", - "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", - "Highly inefficient organization.\n", - "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. 
I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", - "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", - "4. In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", - "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. 
XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", + "I attempted to call the numbers, and now no one picks up just a voicemail. I called the supposed service courier and advised that their number was showing up as a spam/fraud number and that if they were a legitimate company then they should leave their name, location, a number ( not a voicemail ), and the case they are calling me about. I have not been served with any collection documents - why am I being threatened with a deposition??? \n", + "\n", + "Telephone number recently calling me : ( XXXX ) XXXX. \n", + "\n", + "Please help.\n", + "2. I receive 2 or 3 phone calls every day since early XXXX, my references receive calls. I will gladly satisfy this debt however even after 1st telling them the calls haven't stopped as though they are going to intimidate me. If the calls stopped for just 3 or 4 days I would satisfy my obligation but not because they keep calling me as well as my references.\n", + "3. Last month I received a phone call for my husband from XXXX XXXX XXXX saying he owed money and if I did not pay today it would be sent to litigation. The debt was Wachovia/wells Fargo, and account that we have never had. I had my husband call to get more information and they became very nasty with him. I called back asking for documentation on the debt because i did not think it was our debt and they became aggressive. They did email my husband something saying how much he owed, and I called back and asked to be emailed a copy, and the dollar amounts did not match. I called Wells Fargo and went over the above and verified that we have never had an account with them and I sent them the emails the XXXX sent to us and they started a fraud investigation. Yesterday I received another collections letter in the mail from the. Still trying to collect this debt. These people have my husbands full social security number ( we did not give it to them )\n", + "4. A company call XXXX XXXX XXXX came onto my private property on XX/XX/2018 and stole my automobile. I did receive any type of notice saying they collecting on a debt. If they take or threaten to take any nonjudicial action ( i.e, without a court order ) to repossess property when there is no present right to possession of the property they is in violation. l did not receive any type of notice asking if they can enter onto my private property and steal my private automobile.\n", + "5. 
Navient financial continues to send me erroneous debt collection emails. I have repeatedly asked them to remove my email address and to cease all communication with me. \n", + "I have no relationship with Navient and their continued threatening email is very unsettling. \n", + "\n", + "I just want their erroneous threats to stop. \n", + "\n", + "Below is the latest email I have received from them : Last Day to call this office XXXX by XXXX Regards, XXXX XXXX Team Lead Specialist Charge off Unit XXXX XXXX\n", "\n" ] } ], "source": [ - "# Build plain-text prompts to send to PaLM 2. Use only 5 complaints from each group.\n", + "# Build plain-text prompts to send to Gemini. Use only 5 complaints from each group.\n", "prompt1 = 'comment list 1:\\n'\n", "for i in range(5):\n", " prompt1 += str(i + 1) + '. ' + \\\n", @@ -1526,7 +1521,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": { "id": "BfHGJLirzSvH" }, @@ -1537,37 +1532,44 @@ "text": [ "Please highlight the most obvious difference between the two lists of comments:\n", "comment list 1:\n", - "1. This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", + "1. This debt was from my identity being stolen I didnt open any account that resulted in this collection i have completed a police report which can be verified with the XXXX police @ XXXX report # XXXX and i have a notarized identity theft affidavit from ftc please remove this off of my credit and close my file ASAP\n", + "2. On XX/XX/XXXX this company received certified mail asking for validation of debt. On XX/XX/XXXX the company still did not validate debt owed and they did not mark the debt disputed by XX/XX/XXXX through the major credit reporting bureaus. This is a violation of the FDCPA and FCRA. I did send a second letter which the company received on XX/XX/XXXX . A lady from the company called and talked to me about the debt on XX/XX/XXXX but again did not have the credit bureaus mark the item as disputed. The company still violated the laws. Section [ 15 U.S.C. 1681s-2 ] ( 3 ) duty to provide notice of dispute. If the completeness or accuracy of any information furnished by any person to any consumer reporting agency is disputed to such person by a consumer, the person may not furnish the information to any consumer reporting agency without notice that such information is disputed. ( B ) ti me of notice! The notice required under sub paragraph ( A ) shall be provided to the customer prior to, or no later than 30 days after, furnishing the negative information to a consumer reporting agency described in section 603 ( p ). This company violated the state laws. I received no information until XX/XX/XXXX . Therefore by law the company should have the item removed from the credit agencies such as transunion and XXXX . I tried to call the company back about the laws that was broken and left my name no return call. The copy of my credit reports are below and as you can see the items was n't marked disputed. XXXX is marked disputed because on XX/XX/XXXX I myself disputed the information with the credit bureau. The lady stated they did n't receive my dispute letter until XX/XX/XXXX . Included is certified mail reciepts with date, time stamp, and signature of the person who signed for the certified mail on XX/XX/XXXX and XX/XX/XXXX . So again the company violated the laws and I have all the proof. 
If I have a contract with this company please send to me by mail a contract bearing my signature of the contract.\n", + "3. On XX/XX/2022, Pioneer Credit Recovery of XXXX, NY identified an alleged debt, which I do not owe. \n", "\n", - "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. \n", - "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. \n", - "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. \n", - "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", - "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", - "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", - "I was told it would take 5-7 business days to be resolved.\n", - "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", - "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", - "4. My government feeds are not coming on to my card and I need the problem fix today\n", - "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. I am not receiving statements, which proves it difficult to determine the due dates on the accounts. 
The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", + "On XX/XX/2022, I wrote a dispute letter to Pioneer, requesting that they stop communication with me, record my dispute, and provide verification of the debt if they believe otherwise. \n", + "\n", + "Pioneer has not responded with verification, but has attempted to collect the debt since then by phone ( XX/XX/2022 ) and mail ( XX/XX/2022 ).\n", + "4. Disputed with the company on several occasions and they still havent provided proof in a timely manner. The FCRA gives the company 30 days to respond. I have not gotten a response.\n", + "5. I am not aware of this XXXX XXXX XXXX XXXX XXXX , XXXX balance. I have never seen anything dealing with this lender. Also, I have been threated that in 30 days they will seek to make a judgement on debt that does not belong to me. I understand that they are looking to offer me a settlement. However, I do not believe the validity of such debt accusation. Furthermore, I will not be limited to the action of court threats when I did not receive any notice of debt based on communication. The amount is {$880.00} from MBNA which was acquired by Bank of America in 2006. I do not claim debt.\n", "comment list 2:\n", - "1. XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", - "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", - "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", - "Why the online amortization schedule is not visible now? \n", + "1. My name is XXXX XXXX XXXX. This issue with a Loan Till Payday account was previously reported to you for collection practices, etc. I had a pay day loan in 2013. At the time, I banked with XXXX XXXX, who advised me that pay day loans are not good, and in the end XXXX closed my bank account, it was involuntary. In the interim, I made payments to the agency. XXXX and XXXX were the primary contacts. On the last payment, due to the fact that I told him I was coming in to pay cash, and they withdrew the funds, electronically, my account was affected. XXXX advised me that the payment made was the last payment and the other ( which was primarily interest remaining ) would be charged off. XXXX later called me and advised that XXXX was not authorized to make that decision and demanded the payment. I do n't understand how one person can cancel the arrangements made by someone else. \n", + "\n", + "In the end, they sold my account. It was reported to you, and that creditor then stated no further collection activity would occur. 
\n", + "\n", + "Last week I began receiving calls from a collection agency, XXXX XXXX stating I would called for a civil deposition on this account. I do n't even know this agency. Later, I then received another call stating that I needed to hold, and after several clicks was connected to someone at a Mediaction service. I denied the owing the loan and stated it was paid. \n", + "\n", + "Today, I received a call from an outsource service courier about a missed appointment or hearing??? What?? I have no idea who these people are. I called Loan Till Payday and was advised the loan was sold and I needed to settle with the new company. So, does this mean they are continuing to attempt to collect {$200.00}. \n", + "\n", + "I attempted to call the numbers, and now no one picks up just a voicemail. I called the supposed service courier and advised that their number was showing up as a spam/fraud number and that if they were a legitimate company then they should leave their name, location, a number ( not a voicemail ), and the case they are calling me about. I have not been served with any collection documents - why am I being threatened with a deposition??? \n", + "\n", + "Telephone number recently calling me : ( XXXX ) XXXX. \n", + "\n", + "Please help.\n", + "2. I receive 2 or 3 phone calls every day since early XXXX, my references receive calls. I will gladly satisfy this debt however even after 1st telling them the calls haven't stopped as though they are going to intimidate me. If the calls stopped for just 3 or 4 days I would satisfy my obligation but not because they keep calling me as well as my references.\n", + "3. Last month I received a phone call for my husband from XXXX XXXX XXXX saying he owed money and if I did not pay today it would be sent to litigation. The debt was Wachovia/wells Fargo, and account that we have never had. I had my husband call to get more information and they became very nasty with him. I called back asking for documentation on the debt because i did not think it was our debt and they became aggressive. They did email my husband something saying how much he owed, and I called back and asked to be emailed a copy, and the dollar amounts did not match. I called Wells Fargo and went over the above and verified that we have never had an account with them and I sent them the emails the XXXX sent to us and they started a fraud investigation. Yesterday I received another collections letter in the mail from the. Still trying to collect this debt. These people have my husbands full social security number ( we did not give it to them )\n", + "4. A company call XXXX XXXX XXXX came onto my private property on XX/XX/2018 and stole my automobile. I did receive any type of notice saying they collecting on a debt. If they take or threaten to take any nonjudicial action ( i.e, without a court order ) to repossess property when there is no present right to possession of the property they is in violation. l did not receive any type of notice asking if they can enter onto my private property and steal my private automobile.\n", + "5. Navient financial continues to send me erroneous debt collection emails. I have repeatedly asked them to remove my email address and to cease all communication with me. \n", + "I have no relationship with Navient and their continued threatening email is very unsettling. \n", "\n", - "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. 
It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", - "Highly inefficient organization.\n", - "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", - "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", - "4. In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", - "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. 
My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", + "I just want their erroneous threats to stop. \n", + "\n", + "Below is the latest email I have received from them : Last Day to call this office XXXX by XXXX Regards, XXXX XXXX Team Lead Specialist Charge off Unit XXXX XXXX\n", "\n" ] } ], "source": [ - "# The plain English request we will make of PaLM 2\n", + "# The plain English request we will make of Gemini\n", "prompt = (\n", " \"Please highlight the most obvious difference between \"\n", " \"the two lists of comments:\\n\" + prompt1 + prompt2\n", @@ -1585,7 +1587,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "id": "mL5P0_3X04dE" }, @@ -1593,7 +1595,7 @@ { "data": { "text/html": [ - "Query job de5da6c9-96b5-42a1-b199-42687392fe37 is DONE. 0 Bytes processed. Open Job" + "Query job 3a46cad4-14e5-4137-a042-14380733b467 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1611,11 +1613,24 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "id": "ICWHsqAW1FNk" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Load job 939037f0-66df-42a4-b301-0b3ba26bae7c is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Make a DataFrame containing only a single row with our prompt for Gemini\n", "df = bf.DataFrame({\"prompt\": [prompt]})" @@ -1623,7 +1638,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": { "id": "gB7e1LXU1pst" }, @@ -1631,7 +1646,7 @@ { "data": { "text/html": [ - "Query job 1363c327-00b5-4835-a902-da84882bc996 is DONE. 0 Bytes processed. Open Job" + "Query job c662b2c7-7185-4681-b7c6-60c81e9c8cd4 is DONE. 8.2 kB processed. Open Job" ], "text/plain": [ "" @@ -1641,21 +1656,17 @@ "output_type": "display_data" }, { - "data": { - "text/html": [ - "Query job c5996f1e-a140-4e7d-8775-091e1a73d882 is DONE. 8 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/__init__.py:108: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] }, { "data": { "text/html": [ - "Query job db1de3ab-2e6e-4b3f-8e6a-01bad33ac45f is DONE. 2 Bytes processed. Open Job" + "Query job 9a4d6735-c307-4a60-96f9-d81330925e6c is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1667,7 +1678,7 @@ { "data": { "text/html": [ - "Query job 38d9a9d0-7f03-4091-858b-f864da30987e is DONE. 375 Bytes processed. Open Job" + "Query job 17bde6e6-8b26-48a7-9c57-b7b9752c1f54 is DONE. 1.8 kB processed. Open Job" ], "text/plain": [ "" @@ -1679,10 +1690,10 @@ { "data": { "text/plain": [ - "'The most obvious difference between the two lists of comments is the subject matter. The first list of comments is primarily focused on issues with financial institutions, such as Navient, Nelnet, PayPal, and Mr. Cooper. The second list of comments is primarily focused on issues with government agencies, such as the National Collegiate Trust, the USDA, and Ocwen.'" + "\"## Key Differences between Comment Lists 1 and 2:\\n\\n**Comment List 1:**\\n\\n* **Focuses on Legal Violations:** The comments in List 1 primarily focus on how the debt collectors violated specific laws, such as the FDCPA and FCRA, by not validating debt, not marking accounts as disputed, and using illegal collection tactics.\\n* **Detailed Evidence:** Commenters provide detailed evidence of their claims, including dates, reference numbers, police reports, and copies of communications.\\n* **Formal Tone:** The language in List 1 is more formal and uses legal terminology, suggesting the commenters may have a deeper understanding of their rights.\\n* **Emphasis on Debt Accuracy:** Many comments explicitly deny owing the debt and question its validity, requesting proof and demanding removal from credit reports. \\n\\n**Comment List 2:**\\n\\n* **Focus on Harassment and Intimidation:** The comments in List 2 highlight the harassing and intimidating behavior of the debt collectors, such as making multiple calls, contacting references, and threatening legal action.\\n* **Emotional Language:** Commenters express frustration, fear, and anger towards the debt collectors' behavior.\\n* **Less Legal Detail:** While some commenters mention specific laws, they provide less detailed evidence than List 1.\\n* **Uncertainty About Debt:** Several commenters are unsure whether they actually owe the debt, questioning its origin and validity. 
\\n\\n**Overall:**\\n\\n* List 1 focuses on legal arguments and violations, while List 2 emphasizes emotional distress and improper collection tactics.\\n* List 1 provides more concrete evidence of wrongdoing, while List 2 relies more on personal experiences and descriptions.\\n* Both lists highlight the negative impacts of debt collection practices on individuals.\\n\"" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1690,7 +1701,7 @@ "source": [ "# Send the request for Gemini to generate a response to our prompt\n", "major_difference = q_a_model.predict(df)\n", - "# PaLM 2's response is the only row in the dataframe result \n", + "# Gemini's response is the only row in the dataframe result \n", "major_difference[\"ml_generate_text_llm_result\"].iloc[0]" ] }, From d2fc51a30c4fff6fe0b98df61eec70ddb28b37ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 20 Aug 2024 12:15:27 -0500 Subject: [PATCH 5/7] docs: add columns for "requires ordering/index" to supported APIs summary (#892) --- bigframes/core/validations.py | 15 ++++++++++++- bigframes/dataframe.py | 40 +++++++++++++-------------------- bigframes/operations/base.py | 10 --------- bigframes/series.py | 35 ++++++++++++++--------------- scripts/publish_api_coverage.py | 24 +++++++++++++++++++- 5 files changed, 69 insertions(+), 55 deletions(-) diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index 9c03ddb930..daa1252824 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -17,7 +17,7 @@ from __future__ import annotations import functools -from typing import Optional, Protocol, TYPE_CHECKING +from typing import Optional, Protocol, TYPE_CHECKING, Union import bigframes.constants import bigframes.exceptions @@ -25,6 +25,8 @@ if TYPE_CHECKING: from bigframes import Session from bigframes.core.blocks import Block + from bigframes.dataframe import DataFrame + from bigframes.operations.base import SeriesMethods class HasSession(Protocol): @@ -37,6 +39,16 @@ def _block(self) -> Block: ... +def requires_index(meth): + @functools.wraps(meth) + def guarded_meth(df: Union[DataFrame, SeriesMethods], *args, **kwargs): + df._throw_if_null_index(meth.__name__) + return meth(df, *args, **kwargs) + + guarded_meth._validations_requires_index = True # type: ignore + return guarded_meth + + def requires_ordering(suggestion: Optional[str] = None): def decorator(meth): @functools.wraps(meth) @@ -44,6 +56,7 @@ def guarded_meth(object: HasSession, *args, **kwargs): enforce_ordered(object, meth.__name__, suggestion) return meth(object, *args, **kwargs) + guarded_meth._validations_requires_ordering = True # type: ignore return guarded_meth return decorator diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index dabe85c923..a174ef0b0f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,7 +17,6 @@ from __future__ import annotations import datetime -import functools import inspect import re import sys @@ -92,15 +91,6 @@ ) -def requires_index(meth): - @functools.wraps(meth) - def guarded_meth(df: DataFrame, *args, **kwargs): - df._throw_if_null_index(meth.__name__) - return meth(df, *args, **kwargs) - - return guarded_meth - - # Inherits from pandas DataFrame so that we can use the same docstrings. 
@log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): @@ -261,7 +251,7 @@ def _sql_names( return results @property - @requires_index + @validations.requires_index def index( self, ) -> indexes.Index: @@ -277,7 +267,7 @@ def index(self, value): self.index.name = value.name if hasattr(value, "name") else None @property - @requires_index + @validations.requires_index def loc(self) -> indexers.LocDataFrameIndexer: return indexers.LocDataFrameIndexer(self) @@ -292,7 +282,7 @@ def iat(self) -> indexers.IatDataFrameIndexer: return indexers.IatDataFrameIndexer(self) @property - @requires_index + @validations.requires_index def at(self) -> indexers.AtDataFrameIndexer: return indexers.AtDataFrameIndexer(self) @@ -348,7 +338,7 @@ def _has_index(self) -> bool: def T(self) -> DataFrame: return DataFrame(self._get_block().transpose()) - @requires_index + @validations.requires_index @validations.requires_ordering() def transpose(self) -> DataFrame: return self.T @@ -417,7 +407,7 @@ def memory_usage(self, index: bool = True): column_sizes = pandas.concat([index_size, column_sizes]) return column_sizes - @requires_index + @validations.requires_index def info( self, verbose: Optional[bool] = None, @@ -1682,7 +1672,7 @@ def set_index( col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) - @requires_index + @validations.requires_index def sort_index( self, ascending: bool = True, na_position: Literal["first", "last"] = "last" ) -> DataFrame: @@ -1884,7 +1874,7 @@ def reindex( if columns is not None: return self._reindex_columns(columns) - @requires_index + @validations.requires_index def _reindex_rows( self, index, @@ -1931,12 +1921,12 @@ def _reindex_columns(self, columns): result_df.columns = new_column_index return result_df - @requires_index + @validations.requires_index def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None): return self.reindex(index=other.index, columns=other.columns, validate=validate) @validations.requires_ordering() - @requires_index + @validations.requires_index def interpolate(self, method: str = "linear") -> DataFrame: if method == "pad": return self.ffill() @@ -2231,12 +2221,12 @@ def agg( aggregate = agg aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) - @requires_index + @validations.requires_index @validations.requires_ordering() def idxmin(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmin(self._block)) - @requires_index + @validations.requires_index @validations.requires_ordering() def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) @@ -2345,7 +2335,7 @@ def _pivot( ) return DataFrame(pivot_block) - @requires_index + @validations.requires_index @validations.requires_ordering() def pivot( self, @@ -2360,7 +2350,7 @@ def pivot( ) -> DataFrame: return self._pivot(columns=columns, index=index, values=values) - @requires_index + @validations.requires_index @validations.requires_ordering() def pivot_table( self, @@ -2460,7 +2450,7 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) - @requires_index + @validations.requires_index @validations.requires_ordering() def unstack(self, level: LevelsType = -1): if not utils.is_list_like(level): @@ -2711,7 +2701,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") - @requires_index + 
@validations.requires_index def _groupby_level( self, level: LevelsType, diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f339345971..1daa1ea5ae 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,7 +14,6 @@ from __future__ import annotations -import functools import typing from typing import List, Sequence @@ -35,15 +34,6 @@ import bigframes.session -def requires_index(meth): - @functools.wraps(meth) - def guarded_meth(df: SeriesMethods, *args, **kwargs): - df._throw_if_null_index(meth.__name__) - return meth(df, *args, **kwargs) - - return guarded_meth - - class SeriesMethods: def __init__( self, diff --git a/bigframes/series.py b/bigframes/series.py index 7ba4858b5e..c2137bca35 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -52,7 +52,6 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base -from bigframes.operations.base import requires_index import bigframes.operations.datetimes as dt import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings @@ -88,7 +87,7 @@ def dtypes(self): return self._dtype @property - @requires_index + @validations.requires_index def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) @@ -103,7 +102,7 @@ def iat(self) -> bigframes.core.indexers.IatSeriesIndexer: return bigframes.core.indexers.IatSeriesIndexer(self) @property - @requires_index + @validations.requires_index def at(self) -> bigframes.core.indexers.AtSeriesIndexer: return bigframes.core.indexers.AtSeriesIndexer(self) @@ -142,7 +141,7 @@ def values(self) -> numpy.ndarray: return self.to_numpy() @property - @requires_index + @validations.requires_index def index(self) -> indexes.Index: return indexes.Index.from_frame(self) @@ -245,7 +244,7 @@ def rename( raise ValueError(f"Unsupported type of parameter index: {type(index)}") - @requires_index + @validations.requires_index def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], @@ -404,12 +403,12 @@ def drop( block = block.drop_columns([condition_id]) return Series(block.select_column(self._value_column)) - @requires_index + @validations.requires_index def droplevel(self, level: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(level) return Series(self._block.drop_levels(resolved_level_ids)) - @requires_index + @validations.requires_index def swaplevel(self, i: int = -2, j: int = -1): level_i = self._block.index_columns[i] level_j = self._block.index_columns[j] @@ -419,7 +418,7 @@ def swaplevel(self, i: int = -2, j: int = -1): ] return Series(self._block.reorder_levels(reordering)) - @requires_index + @validations.requires_index def reorder_levels(self, order: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(order) return Series(self._block.reorder_levels(resolved_level_ids)) @@ -609,7 +608,7 @@ def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): return Series(block.select_column(result)) @validations.requires_ordering() - @requires_index + @validations.requires_index def interpolate(self, method: str = "linear") -> Series: if method == "pad": return self.ffill() @@ -1162,7 +1161,7 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) - @requires_index + @validations.requires_index def unstack(self, level: LevelsType = -1): if isinstance(level, int) or isinstance(level, 
str): level = [level] @@ -1186,7 +1185,7 @@ def unstack(self, level: LevelsType = -1): ) return bigframes.dataframe.DataFrame(pivot_block) - @requires_index + @validations.requires_index def idxmax(self) -> blocks.Label: block = self._block.order_by( [ @@ -1200,7 +1199,7 @@ def idxmax(self) -> blocks.Label: block = block.slice(0, 1) return indexes.Index(block).to_pandas()[0] - @requires_index + @validations.requires_index def idxmin(self) -> blocks.Label: block = self._block.order_by( [ @@ -1314,7 +1313,7 @@ def sort_values( ) return Series(block) - @requires_index + @validations.requires_index def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: # TODO(tbergeron): Support level parameter once multi-index introduced. if na_position not in ["first", "last"]: @@ -1377,7 +1376,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") - @requires_index + @validations.requires_index def _groupby_level( self, level: int | str | typing.Sequence[int] | typing.Sequence[str], @@ -1518,11 +1517,11 @@ def combine( materialized_series = result_series._cached() return materialized_series - @requires_index + @validations.requires_index def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) - @requires_index + @validations.requires_index def add_suffix(self, suffix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_suffix(suffix)) @@ -1574,7 +1573,7 @@ def filter( else: raise ValueError("Need to provide 'items', 'like', or 'regex'") - @requires_index + @validations.requires_index def reindex(self, index=None, *, validate: typing.Optional[bool] = None): if validate and not self.index.is_unique: raise ValueError("Original index must be unique to reindex") @@ -1603,7 +1602,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): )._block return Series(result_block) - @requires_index + @validations.requires_index def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None): return self.reindex(other.index, validate=validate) diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 25fbfbf988..0292d4880d 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -116,7 +116,15 @@ def generate_pandas_api_coverage(): """Inspect all our pandas objects, and compare with the real pandas objects, to see which methods we implement. 
For each, generate a regex that can be used to check if it's present in a notebook"""
-    header = ["api", "pattern", "kind", "is_in_bigframes", "missing_parameters"]
+    header = [
+        "api",
+        "pattern",
+        "kind",
+        "is_in_bigframes",
+        "missing_parameters",
+        "requires_index",
+        "requires_ordering",
+    ]
     api_patterns = []
     indexers = ["loc", "iloc", "iat", "ix", "at"]
     for name, pandas_obj, bigframes_obj in PANDAS_TARGETS:
@@ -156,6 +164,13 @@ def generate_pandas_api_coverage():
                 token_type = "property"
 
             is_in_bigframes = hasattr(bigframes_obj, member)
+            requires_index = False
+            requires_ordering = False
+
+            if is_in_bigframes:
+                attr = getattr(bigframes_obj, member)
+                requires_index = hasattr(attr, "_validations_requires_index")
+                requires_ordering = hasattr(attr, "_validations_requires_ordering")
 
             api_patterns.append(
                 [
@@ -164,6 +179,8 @@ def generate_pandas_api_coverage():
                     token_type,
                     is_in_bigframes,
                     missing_parameters,
+                    requires_index,
+                    requires_ordering,
                 ]
             )
 
@@ -287,6 +304,7 @@ def generate_api_coverage(df, api_prefix):
         dataframe_apis["missing_parameters"].str.len() != 0
     ) & dataframe_apis["is_in_bigframes"]
     not_implemented = ~dataframe_apis["is_in_bigframes"]
+
     dataframe_table = pd.DataFrame(
         {
             "API": format_api(
@@ -295,12 +313,16 @@ def generate_api_coverage(df, api_prefix):
                 api_prefix,
             ),
             "Implemented": "",
+            "Requires index": "",
+            "Requires ordering": "",
             "Missing parameters": dataframe_apis["missing_parameters"],
         }
     )
     dataframe_table.loc[fully_implemented, "Implemented"] = "Y"
     dataframe_table.loc[partial_implemented, "Implemented"] = "P"
     dataframe_table.loc[not_implemented, "Implemented"] = "N"
+    dataframe_table.loc[dataframe_apis["requires_index"], "Requires index"] = "Y"
+    dataframe_table.loc[dataframe_apis["requires_ordering"], "Requires ordering"] = "Y"
 
     return dataframe_table
 

From 7050038eeee258452860941aa6b01d6a8ae10c6f Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Tue, 20 Aug 2024 10:41:18 -0700
Subject: [PATCH 6/7] feat: add ml.llm.Claude3TextGenerator model (#901)

* feat: add ml.llm.Claude3TextGenerator model

* add in toc.yml

* fix mypy

* add models

---
 bigframes/ml/llm.py               | 233 ++++++++++++++++++++++++++++++
 bigframes/ml/loader.py            |   5 +
 docs/templates/toc.yml            |   2 +
 tests/system/conftest.py          |  10 ++
 tests/system/small/ml/test_llm.py |  65 ++++++++-
 5 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 45634423c6..35bcf0a33c 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -61,6 +61,17 @@
     _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT,
 )
 
+_CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet"
+_CLAUDE_3_HAIKU_ENDPOINT = "claude-3-haiku"
+_CLAUDE_3_5_SONNET_ENDPOINT = "claude-3-5-sonnet"
+_CLAUDE_3_OPUS_ENDPOINT = "claude-3-opus"
+_CLAUDE_3_ENDPOINTS = (
+    _CLAUDE_3_SONNET_ENDPOINT,
+    _CLAUDE_3_HAIKU_ENDPOINT,
+    _CLAUDE_3_5_SONNET_ENDPOINT,
+    _CLAUDE_3_OPUS_ENDPOINT,
+)
+
 _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status"
 _ML_EMBED_TEXT_STATUS = "ml_embed_text_status"
 
@@ -1020,3 +1031,225 @@ def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
 
         new_model = self._bqml_model.copy(model_name, replace)
         return new_model.session.read_gbq_model(model_name)
+
+
+@log_adapter.class_logger
+class Claude3TextGenerator(base.BaseEstimator):
+    """Claude3 text generator LLM model.
+
+    Go to the Google Cloud Console -> Vertex AI -> Model Garden page to enable the models before use.
You must have the Consumer Procurement Entitlement Manager Identity and Access Management (IAM) role to enable the models.
+    https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-partner-models#grant-permissions
+
+    .. note::
+
+        This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+        Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+        and might have limited support. For more information, see the launch stage descriptions
+        (https://cloud.google.com/products#product-launch-stages).
+
+
+    .. note::
+
+        The models are only available in specific regions. Check https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#regions for details.
+
+    Args:
+        model_name (str, Default to "claude-3-sonnet"):
+            The model for natural language tasks. Possible values are "claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", and "claude-3-opus".
+            "claude-3-sonnet" is Anthropic's balanced combination of skills and speed, engineered to be dependable for scaled AI deployments across a variety of use cases.
+            "claude-3-haiku" is Anthropic's fastest, most compact vision and text model for near-instant responses to simple queries, meant for seamless AI experiences mimicking human interactions.
+            "claude-3-5-sonnet" is Anthropic's most powerful AI model and maintains the speed and cost of Claude 3 Sonnet, which is a mid-tier model.
+            "claude-3-opus" is Anthropic's second-most powerful AI model, with strong performance on highly complex tasks.
+            https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#available-claude-models
+            Default to "claude-3-sonnet".
+        session (bigframes.Session or None):
+            BQ session to create the model. If None, use the global default session.
+        connection_name (str or None):
+            Connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>.
+            If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach
+            permission if the connection isn't fully set up.
+    """
+
+    def __init__(
+        self,
+        *,
+        model_name: Literal[
+            "claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", "claude-3-opus"
+        ] = "claude-3-sonnet",
+        session: Optional[bigframes.Session] = None,
+        connection_name: Optional[str] = None,
+    ):
+        self.model_name = model_name
+        self.session = session or bpd.get_global_session()
+        self._bq_connection_manager = self.session.bqconnectionmanager
+
+        connection_name = connection_name or self.session._bq_connection
+        self.connection_name = clients.resolve_full_bq_connection_name(
+            connection_name,
+            default_project=self.session._project,
+            default_location=self.session._location,
+        )
+
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._bqml_model: core.BqmlModel = self._create_bqml_model()
+
+    def _create_bqml_model(self):
+        # Parse and create connection if needed.
+        if not self.connection_name:
+            raise ValueError(
+                "Must provide connection_name, either in constructor or through session options."
+            )
+
+        if self._bq_connection_manager:
+            connection_name_parts = self.connection_name.split(".")
+            if len(connection_name_parts) != 3:
+                raise ValueError(
+                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
+                )
+            self._bq_connection_manager.create_bq_connection(
+                project_id=connection_name_parts[0],
+                location=connection_name_parts[1],
+                connection_id=connection_name_parts[2],
+                iam_role="aiplatform.user",
+            )
+
+        if self.model_name not in _CLAUDE_3_ENDPOINTS:
+            raise ValueError(
+                f"Model name {self.model_name} is not supported. We only support {', '.join(_CLAUDE_3_ENDPOINTS)}."
+            )
+
+        options = {
+            "endpoint": self.model_name,
+        }
+
+        return self._bqml_model_factory.create_remote_model(
+            session=self.session, connection_name=self.connection_name, options=options
+        )
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.Session, bq_model: bigquery.Model
+    ) -> Claude3TextGenerator:
+        assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED"
+        assert "remoteModelInfo" in bq_model._properties
+        assert "endpoint" in bq_model._properties["remoteModelInfo"]
+        assert "connection" in bq_model._properties["remoteModelInfo"]
+
+        # Parse the remote model endpoint
+        bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"]
+        model_connection = bq_model._properties["remoteModelInfo"]["connection"]
+        model_endpoint = bqml_endpoint.split("/")[-1]
+
+        kwargs = utils.retrieve_params_from_bq_model(
+            cls, bq_model, _BQML_PARAMS_MAPPING
+        )
+
+        model = cls(
+            **kwargs,
+            session=session,
+            model_name=model_endpoint,
+            connection_name=model_connection,
+        )
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    @property
+    def _bqml_options(self) -> dict:
+        """The model options as they will be set for BQML"""
+        options = {
+            "data_split_method": "NO_SPLIT",
+        }
+        return options
+
+    def predict(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        *,
+        max_output_tokens: int = 128,
+        top_k: int = 40,
+        top_p: float = 0.95,
+    ) -> bpd.DataFrame:
+        """Predict the result from input DataFrame.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Input DataFrame or Series, which contains only one column of prompts.
+                Prompts can include preamble, questions, suggestions, instructions, or examples.
+
+            max_output_tokens (int, default 128):
+                Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses.
+                A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words.
+                Default 128. Possible values are in the range [1, 4096].
+
+            top_k (int, default 40):
+                Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens
+                in the model's vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature).
+                For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling.
+                Specify a lower value for less random responses and a higher value for more random responses.
+                Default 40. Possible values [1, 40].
+
+            top_p (float, default 0.95):
+                Top-p changes how the model selects tokens for output. Tokens are selected from the most probable to the least (see top_k) until the sum of their probabilities equals the top-p value.
+                For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-p value is 0.5, then the model will select either A or B as the next token (using temperature)
+                and not consider C at all.
+                Specify a lower value for less random responses and a higher value for more random responses.
+                Default 0.95. Possible values [0.0, 1.0].
+
+
+        Returns:
+            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values.
+        """
+
+        # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models
+        if max_output_tokens not in range(1, 4097):
+            raise ValueError(
+                f"max_output_tokens must be [1, 4096], but is {max_output_tokens}."
+            )
+
+        if top_k not in range(1, 41):
+            raise ValueError(f"top_k must be [1, 40], but is {top_k}.")
+
+        if top_p < 0.0 or top_p > 1.0:
+            raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        if len(X.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identifies the column by name
+        col_label = cast(blocks.Label, X.columns[0])
+        X = X.rename(columns={col_label: "prompt"})
+
+        options = {
+            "max_output_tokens": max_output_tokens,
+            "top_k": top_k,
+            "top_p": top_p,
+            "flatten_json_output": True,
+        }
+
+        df = self._bqml_model.generate_text(X, options)
+
+        if (df[_ML_GENERATE_TEXT_STATUS] != "").any():
+            warnings.warn(
+                f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for detailed status. You may want to filter the failed rows and retry.",
+                RuntimeWarning,
+            )
+
+        return df
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> Claude3TextGenerator:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            Claude3TextGenerator: Saved model."""
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py
index bd01342152..7d75f4c65a 100644
--- a/bigframes/ml/loader.py
+++ b/bigframes/ml/loader.py
@@ -63,6 +63,10 @@
     llm._GEMINI_PRO_ENDPOINT: llm.GeminiTextGenerator,
     llm._GEMINI_1P5_PRO_PREVIEW_ENDPOINT: llm.GeminiTextGenerator,
     llm._GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT: llm.GeminiTextGenerator,
+    llm._CLAUDE_3_HAIKU_ENDPOINT: llm.Claude3TextGenerator,
+    llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator,
+    llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator,
+    llm._CLAUDE_3_OPUS_ENDPOINT: llm.Claude3TextGenerator,
     llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator,
     llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator,
 }
@@ -86,6 +90,7 @@ def from_bq(
             imported.XGBoostModel,
             llm.PaLM2TextGenerator,
             llm.PaLM2TextEmbeddingGenerator,
+            llm.Claude3TextGenerator,
             llm.TextEmbeddingGenerator,
             pipeline.Pipeline,
             compose.ColumnTransformer,
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index 736ffba286..bab4ad9aac 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -157,6 +157,8 @@
           uid: bigframes.ml.llm.PaLM2TextGenerator
         - name: PaLM2TextEmbeddingGenerator
           uid: bigframes.ml.llm.PaLM2TextEmbeddingGenerator
+        - name: Claude3TextGenerator
+          uid: bigframes.ml.llm.Claude3TextGenerator
       name: llm
   - items:
     - name: metrics
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index 83c8baac39..05ff80dc33 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -145,6 +145,16 @@ def session() -> Generator[bigframes.Session, None, None]:
     session.close()  # close generated session at cleanup time
+@pytest.fixture(scope="session") +def session_us_east5() -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions( + location="us-east5", + ) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time + + @pytest.fixture(scope="session") def session_load() -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions(location="US", project="bigframes-load-testing") diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index c2f62096d0..1647eb879f 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -18,7 +18,7 @@ from tests.system import utils -def test_create_text_generator_model( +def test_create_load_text_generator_model( palm2_text_generator_model, dataset_id, bq_connection ): # Model creation doesn't return error @@ -34,7 +34,7 @@ def test_create_text_generator_model( assert reloaded_model.connection_name == bq_connection -def test_create_text_generator_32k_model( +def test_create_load_text_generator_32k_model( palm2_text_generator_32k_model, dataset_id, bq_connection ): # Model creation doesn't return error @@ -405,6 +405,67 @@ def test_gemini_text_generator_predict_with_params_success( assert all(series.str.len() > 20) +# TODO(garrettwu): add tests for claude3.5 sonnet and claude3 opus as they are only available in other regions. +@pytest.mark.parametrize( + "model_name", + ("claude-3-sonnet", "claude-3-haiku"), +) +def test_claude3_text_generator_create_load( + dataset_id, model_name, session, bq_connection +): + claude3_text_generator_model = llm.Claude3TextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + assert claude3_text_generator_model is not None + assert claude3_text_generator_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = claude3_text_generator_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + assert reloaded_model.model_name == model_name + + +@pytest.mark.parametrize( + "model_name", + ("claude-3-sonnet", "claude-3-haiku"), +) +@pytest.mark.flaky(retries=2) +def test_claude3_text_generator_predict_default_params_success( + llm_text_df, model_name, session, bq_connection +): + claude3_text_generator_model = llm.Claude3TextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = claude3_text_generator_model.predict(llm_text_df).to_pandas() + assert df.shape == (3, 3) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) + + +@pytest.mark.parametrize( + "model_name", + ("claude-3-sonnet", "claude-3-haiku"), +) +@pytest.mark.flaky(retries=2) +def test_claude3_text_generator_predict_with_params_success( + llm_text_df, model_name, session, bq_connection +): + claude3_text_generator_model = llm.Claude3TextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = claude3_text_generator_model.predict( + llm_text_df, max_output_tokens=100, top_k=20, top_p=0.5 + ).to_pandas() + assert df.shape == (3, 3) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) + + @pytest.mark.flaky(retries=2) def 
test_llm_palm_score(llm_fine_tune_df_default_index): model = llm.PaLM2TextGenerator(model_name="text-bison") From e43e0e53c10b2f7629f04de3e00204ad150e3337 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:44:14 -0700 Subject: [PATCH 7/7] chore(main): release 1.15.0 (#902) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 15 +++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 754658c5e1..e77062dfa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,21 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.15.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.14.0...v1.15.0) (2024-08-20) + + +### Features + +* Add llm.TextEmbeddingGenerator to support new embedding models ([#905](https://github.com/googleapis/python-bigquery-dataframes/issues/905)) ([6bc6a41](https://github.com/googleapis/python-bigquery-dataframes/commit/6bc6a41426fbbb60e77cd77f80860f88a1751a4b)) +* Add ml.llm.Claude3TextGenerator model ([#901](https://github.com/googleapis/python-bigquery-dataframes/issues/901)) ([7050038](https://github.com/googleapis/python-bigquery-dataframes/commit/7050038eeee258452860941aa6b01d6a8ae10c6f)) + + +### Documentation + +* Add columns for "requires ordering/index" to supported APIs summary ([#892](https://github.com/googleapis/python-bigquery-dataframes/issues/892)) ([d2fc51a](https://github.com/googleapis/python-bigquery-dataframes/commit/d2fc51a30c4fff6fe0b98df61eec70ddb28b37ec)) +* Remove duplicate description for `kms_key_name` ([#898](https://github.com/googleapis/python-bigquery-dataframes/issues/898)) ([1053d56](https://github.com/googleapis/python-bigquery-dataframes/commit/1053d56260eef1cff6e7c419f6c86be8f7e74373)) +* Update embedding model notebooks ([#906](https://github.com/googleapis/python-bigquery-dataframes/issues/906)) ([d9b8ef5](https://github.com/googleapis/python-bigquery-dataframes/commit/d9b8ef56deb0c776edeeb0112bd9d35d5ed1b70e)) + ## [1.14.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.13.0...v1.14.0) (2024-08-14) diff --git a/bigframes/version.py b/bigframes/version.py index 2e135689ed..f0f332d182 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.14.0" +__version__ = "1.15.0"
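
A note on the mechanism behind PATCH 5: the `requires_index` and `requires_ordering` decorators do double duty. At runtime they guard the wrapped method, and at documentation-build time `scripts/publish_api_coverage.py` detects them purely through the marker attributes they attach, as seen in the `hasattr` checks above. Below is a minimal, self-contained sketch of that marker-attribute pattern; the toy `Frame` class is hypothetical and exists only to make the sketch runnable.

import functools


def requires_index(meth):
    @functools.wraps(meth)
    def guarded_meth(self, *args, **kwargs):
        # A real guard (as in bigframes/core/validations.py) would raise
        # before calling meth if the object has no usable index.
        return meth(self, *args, **kwargs)

    # The marker is set on the wrapper itself; the coverage script only
    # probes for its existence with hasattr(), never reads its value.
    guarded_meth._validations_requires_index = True
    return guarded_meth


class Frame:  # hypothetical stand-in for a DataFrame-like class
    @requires_index
    def sort_index(self):
        return "sorted"

    def head(self):
        return "first rows"


for member in ("sort_index", "head"):
    attr = getattr(Frame, member)
    print(member, hasattr(attr, "_validations_requires_index"))
# Prints "sort_index True" then "head False"

Because the flag is just an attribute, the coverage script never has to call the method (which would require a live session); cheap introspection is the design choice that makes the new "Requires index"/"Requires ordering" columns possible.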
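
For PATCH 6, a hedged usage sketch of the new `Claude3TextGenerator`, assembled only from the constructor and `predict()` signatures added above and the calls its system tests make. The project and connection names are placeholders, the Claude models must already be enabled in Model Garden, and `us-east5` is one Claude-supported region per the new conftest fixture.

import bigframes
import bigframes.pandas as bpd
from bigframes.ml import llm

# Claude models are served only in specific regions; us-east5 mirrors the
# session_us_east5 fixture added in this series.
context = bigframes.BigQueryOptions(location="us-east5")
session = bigframes.Session(context=context)

model = llm.Claude3TextGenerator(
    model_name="claude-3-haiku",
    session=session,
    connection_name="my-project.us-east5.my-connection",  # placeholder connection
)

# A single-column DataFrame of prompts, as predict() requires.
df = bpd.DataFrame({"prompt": ["What is BigQuery DataFrames?"]}, session=session)
result = model.predict(df, max_output_tokens=100, top_k=20, top_p=0.5).to_pandas()
print(result["ml_generate_text_llm_result"].iloc[0])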