From aa4be7910bf09f16cc3dd1a2ca39d895d2750f07 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:10:30 -0700 Subject: [PATCH 1/4] Fully update inspect_content and redact DLP samples --- dlp/inspect_content.py | 385 +++++++++++++++++++++++++++++++++++- dlp/inspect_content_test.py | 103 ++++++++++ dlp/redact.py | 111 +---------- dlp/redact_test.py | 29 --- dlp/requirements.txt | 4 +- 5 files changed, 494 insertions(+), 138 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index f99e40db57c..4fb45bb34b6 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -21,7 +21,7 @@ import os -# [START inspect_string] +# [START dlp_inspect_string] def inspect_string(project, content_string, info_types, min_likelihood=None, max_findings=None, include_quote=True): """Uses the Data Loss Prevention API to analyze strings for protected data. @@ -80,10 +80,10 @@ def inspect_string(project, content_string, info_types, print('Likelihood: {}'.format(finding.likelihood)) else: print('No findings.') -# [END inspect_string] +# [END dlp_inspect_string] -# [START inspect_file] +# [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, max_findings=None, include_quote=True, mime_type=None): """Uses the Data Loss Prevention API to analyze a file for protected data. @@ -163,10 +163,10 @@ def inspect_file(project, filename, info_types, min_likelihood=None, print('Likelihood: {}'.format(finding.likelihood)) else: print('No findings.') -# [END inspect_file] +# [END dlp_inspect_file] -# [START inspect_gcs_file] +# [START dlp_inspect_gcs] def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, info_types, min_likelihood=None, max_findings=None, timeout=300): @@ -192,6 +192,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, None; the response from the API is printed to the terminal. """ + # Import the client library. import google.cloud.dlp @@ -219,7 +220,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, 'limits': {'max_findings_per_request': max_findings}, } - # Construct a cloud_storage_options dictionary with the file's URL. + # Construct a storage_config containing the file's URL. url = 'gs://{}/{}'.format(bucket, filename) storage_config = { 'cloud_storage_options': { @@ -288,7 +289,266 @@ def callback(message): print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.') -# [END inspect_gcs_file] +# [END dlp_inspect_gcs] + + +# [START dlp_inspect_datastore] +def inspect_datastore(project, datastore_project, kind, + topic_id, subscription_id, info_types, namespace_id=None, + min_likelihood=None, max_findings=None, timeout=300): + """Uses the Data Loss Prevention API to analyze Datastore data. + Args: + project: The Google Cloud project id to use as a parent resource. + datastore_project: The Google Cloud project id of the target Datastore. + kind: The kind of the Datastore entity to inspect, e.g. 'Person'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. 
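+            If empty or None, the sample defaults to 'FIRST_NAME',
+            'LAST_NAME' and 'EMAIL_ADDRESS'.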
+ namespace_id: The namespace of the Datastore document, if applicable. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] + info_types = [{'name': info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'limits': {'max_findings_per_request': max_findings}, + } + + # Construct a storage_config containing the target Datastore info. + storage_config = { + 'datastore_options': { + 'partition_id': { + 'project_id': datastore_project, + 'namespace_id': namespace_id, + }, + 'kind': { + 'name': kind + }, + } + } + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Tell the API where to send a notification when the job is complete. + actions = [{ + 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} + }] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + 'inspect_config': inspect_config, + 'storage_config': storage_config, + 'actions': actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, subscription_id) + subscription = subscriber.subscribe(subscription_path) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if (message.attributes['DlpJobName'] == operation.name): + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print('Info type: {}; Count: {}'.format( + finding.info_type.name, finding.count)) + else: + print('No findings.') + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. 
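+    # (With the pinned google-cloud-pubsub==0.32.1, subscribe() returns a
+    # subscription object and open() starts the background message consumer;
+    # newer releases of the library changed this interface.)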
+    subscription.open(callback)
+    finished = job_done.wait(timeout=timeout)
+    if not finished:
+        print('No event received before the timeout. Please verify that the '
+              'subscription provided is subscribed to the topic provided.')
+
+# [END dlp_inspect_datastore]
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
+                     topic_id, subscription_id, info_types,
+                     min_likelihood=None, max_findings=None, timeout=300):
+    """Uses the Data Loss Prevention API to analyze BigQuery data.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        bigquery_project: The Google Cloud project id of the target table.
+        dataset_id: The id of the target BigQuery dataset.
+        table_id: The id of the target BigQuery table.
+        topic_id: The id of the Cloud Pub/Sub topic to which the API will
+            broadcast job completion. The topic must already exist.
+        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+            while waiting for job completion. The subscription must already
+            exist and be subscribed to the topic.
+        info_types: A list of strings representing info types to look for.
+            A full list of info type categories can be fetched from the API.
+        min_likelihood: A string representing the minimum likelihood threshold
+            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+        max_findings: The maximum number of findings to report; 0 = no maximum.
+        timeout: The number of seconds to wait for a response from the API.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # This sample also uses threading.Event() to wait for the job to finish.
+    import threading
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp.DlpServiceClient()
+
+    # Prepare info_types by converting the list of strings into a list of
+    # dictionaries (protos are also accepted).
+    if not info_types:
+        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
+    info_types = [{'name': info_type} for info_type in info_types]
+
+    # Construct the configuration dictionary. Keys which are None may
+    # optionally be omitted entirely.
+    inspect_config = {
+        'info_types': info_types,
+        'min_likelihood': min_likelihood,
+        'limits': {'max_findings_per_request': max_findings},
+    }
+
+    # Construct a storage_config containing the target BigQuery info.
+    storage_config = {
+        'big_query_options': {
+            'table_reference': {
+                'project_id': bigquery_project,
+                'dataset_id': dataset_id,
+                'table_id': table_id,
+            }
+        }
+    }
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{
+        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
+    }]
+
+    # Construct the inspect_job, which defines the entire inspect content task.
+    inspect_job = {
+        'inspect_config': inspect_config,
+        'storage_config': storage_config,
+        'actions': actions,
+    }
+
+    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
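+    # (If needed, both can be created ahead of time, for example with
+    # `gcloud pubsub topics create <topic>` and
+    # `gcloud pubsub subscriptions create <subscription> --topic <topic>`.)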
+ subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, subscription_id) + subscription = subscriber.subscribe(subscription_path) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if (message.attributes['DlpJobName'] == operation.name): + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print('Info type: {}; Count: {}'.format( + finding.info_type.name, finding.count)) + else: + print('No findings.') + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscription.open(callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print('No event received before the timeout. Please verify that the ' + 'subscription provided is subscribed to the topic provided.') + +# [END dlp_inspect_bigquery] if __name__ == '__main__': @@ -404,6 +664,100 @@ def callback(message): 'API. The default is 300 seconds.', default=300) + parser_datastore = subparsers.add_parser( + 'datastore', help='Inspect files on Google Datastore.') + parser_datastore.add_argument( + 'datastore_project', + help='The Google Cloud project id of the target Datastore.') + parser_datastore.add_argument( + 'kind', + help='The kind of the Datastore entity to inspect, e.g. "Person".') + parser_datastore.add_argument( + 'topic_id', + help='The id of the Cloud Pub/Sub topic to use to report that the job ' + 'is complete, e.g. "dlp-sample-topic".') + parser_datastore.add_argument( + 'subscription_id', + help='The id of the Cloud Pub/Sub subscription to monitor for job ' + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + 'already be subscribed to the topic. See the test files or the Cloud ' + 'Pub/Sub sample files for examples on how to create the subscription.') + parser_datastore.add_argument( + '--project', + help='The Google Cloud project id to use as a parent resource.', + default=default_project) + parser_datastore.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_datastore.add_argument( + '--namespace_id', + help='The Datastore namespace to use, if applicable.') + parser_datastore.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_datastore.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_datastore.add_argument( + '--timeout', type=int, + help='The maximum number of seconds to wait for a response from the ' + 'API. The default is 300 seconds.', + default=300) + + parser_bigquery = subparsers.add_parser( + 'bigquery', help='Inspect files on Google BigQuery.') + parser_bigquery.add_argument( + 'bigquery_project', + help='The Google Cloud project id of the target table.') + parser_bigquery.add_argument( + 'dataset_id', + help='The ID of the target BigQuery dataset.') + parser_bigquery.add_argument( + 'table_id', + help='The ID of the target BigQuery table.') + parser_bigquery.add_argument( + 'topic_id', + help='The id of the Cloud Pub/Sub topic to use to report that the job ' + 'is complete, e.g. "dlp-sample-topic".') + parser_bigquery.add_argument( + 'subscription_id', + help='The id of the Cloud Pub/Sub subscription to monitor for job ' + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + 'already be subscribed to the topic. See the test files or the Cloud ' + 'Pub/Sub sample files for examples on how to create the subscription.') + parser_bigquery.add_argument( + '--project', + help='The Google Cloud project id to use as a parent resource.', + default=default_project) + parser_bigquery.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_bigquery.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_bigquery.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_bigquery.add_argument( + '--timeout', type=int, + help='The maximum number of seconds to wait for a response from the ' + 'API. 
The default is 300 seconds.', + default=300) + args = parser.parse_args() if args.content == 'string': @@ -427,3 +781,20 @@ def callback(message): min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) + elif args.content == 'datastore': + inspect_datastore( + args.project, args.datastore_project, args.kind, + args.topic_id, args.subscription_id, + args.info_types, + namespace_id=args.namespace_id, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout) + elif args.content == 'bigquery': + inspect_bigquery( + args.project, args.bigquery_project, args.dataset_id, + args.table_id, args.topic_id, args.subscription_id, + args.info_types, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 62d0770c9f2..96f09a2c11d 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -15,6 +15,8 @@ import os import google.api_core.exceptions +import google.cloud.bigquery +import google.cloud.datastore import google.cloud.exceptions import google.cloud.pubsub import google.cloud.storage @@ -30,6 +32,9 @@ RESOURCE_FILE_NAMES = ['test.txt', 'test.png', 'harmless.txt', 'accounts.txt'] TOPIC_ID = 'dlp-test' SUBSCRIPTION_ID = 'dlp-test-subscription' +DATASTORE_KIND = 'DLP test kind' +BIGQUERY_DATASET_ID = 'dlp_test_dataset' +BIGQUERY_TABLE_ID = 'dlp_test_table' @pytest.fixture(scope='module') @@ -94,6 +99,61 @@ def subscription_id(topic_id): subscriber.delete_subscription(subscription_path) +@pytest.fixture(scope='module') +def datastore_project(): + # Adds test Datastore data, yields the project ID and then tears down. + datastore_client = google.cloud.datastore.Client() + + kind = DATASTORE_KIND + name = 'DLP test object' + key = datastore_client.key(kind, name) + item = google.cloud.datastore.Entity(key=key) + item['payload'] = 'My name is Gary Smith and my email is gary@example.com' + + datastore_client.put(item) + + yield GCLOUD_PROJECT + + datastore_client.delete(key) + + +@pytest.fixture(scope='module') +def bigquery_project(): + # Adds test Bigquery data, yields the project ID and then tears down. 
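+    # (scope='module' means the dataset and table are created once and shared
+    # by all tests in this file; the Conflict handlers below reuse anything
+    # left over from an earlier run.)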
+    bigquery_client = google.cloud.bigquery.Client()
+
+    dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+    dataset = google.cloud.bigquery.Dataset(dataset_ref)
+    try:
+        dataset = bigquery_client.create_dataset(dataset)
+    except google.api_core.exceptions.Conflict:
+        dataset = bigquery_client.get_dataset(dataset)
+
+    table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+    table = google.cloud.bigquery.Table(table_ref)
+
+    # A minimal two-column schema is enough for the findings these tests
+    # assert on.
+    table.schema = (
+        google.cloud.bigquery.SchemaField('Name', 'STRING'),
+        google.cloud.bigquery.SchemaField('Comment', 'STRING'),
+    )
+
+    try:
+        table = bigquery_client.create_table(table)
+    except google.api_core.exceptions.Conflict:
+        table = bigquery_client.get_table(table)
+
+    rows_to_insert = [
+        (u'Gary Smith', u'My email is gary@example.com',)
+    ]
+
+    bigquery_client.insert_rows(table, rows_to_insert)
+
+    yield GCLOUD_PROJECT
+
+    bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+
 def test_inspect_string(capsys):
     test_string = 'My name is Gary Smith and my email is gary@example.com'
 
@@ -212,3 +272,46 @@ def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys):
     out, _ = capsys.readouterr()
     assert 'Info type: EMAIL_ADDRESS' in out
     assert 'Info type: PHONE_NUMBER' in out
+
+
+def test_inspect_datastore(
+        datastore_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_datastore(
+        GCLOUD_PROJECT,
+        datastore_project,
+        DATASTORE_KIND,
+        topic_id,
+        subscription_id,
+        ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: EMAIL_ADDRESS' in out
+
+
+def test_inspect_datastore_no_results(
+        datastore_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_datastore(
+        GCLOUD_PROJECT,
+        datastore_project,
+        DATASTORE_KIND,
+        topic_id,
+        subscription_id,
+        ['PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'No findings' in out
+
+
+def test_inspect_bigquery(
+        bigquery_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_bigquery(
+        GCLOUD_PROJECT,
+        bigquery_project,
+        BIGQUERY_DATASET_ID,
+        BIGQUERY_TABLE_ID,
+        topic_id,
+        subscription_id,
+        ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: FIRST_NAME' in out
diff --git a/dlp/redact.py b/dlp/redact.py
index 678999d2cb4..7d7a5379293 100644
--- a/dlp/redact.py
+++ b/dlp/redact.py
@@ -22,67 +22,6 @@
 import os
 
 
-# [START redact_string]
-def redact_string(item, replace_string, info_types=None, min_likelihood=None):
-    """Uses the Data Loss Prevention API to redact protected data in a string.
-    Args:
-        item: The string to inspect.
-        replace_string: The string to use to replace protected data; for
-            instance, '***' or 'REDACTED'. An empty string is permitted.
-        info_types: A list of strings representing info types to look for.
-            A full list of info type categories can be fetched from the API. If
-            info_types is omitted, the API will use a limited default set.
-        min_likelihood: A string representing the minimum likelihood threshold
-            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
-            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
-    Returns:
-        None; the response from the API is printed to the terminal.
-    """
-    # Import the client library
-    import google.cloud.dlp_v2beta1
-
-    # Instantiate a client.
- dlp = google.cloud.dlp_v2beta1.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - if info_types is not None: - info_types = [{'name': info_type} for info_type in info_types] - - # Prepare replace_configs, a list of dictionaries. Each dictionary contains - # an info_type and the string to which that info_type will be redacted upon - # detection. This sample uses the same "replace_string" for all info types, - # though the API supports using different ones for each type. - replace_configs = [] - - if info_types is not None: - for info_type in info_types: - replace_configs.append( - {'info_type': info_type, - 'replace_with': replace_string}) - else: - # If no info_type is specified, prepare a single dictionary with only a - # replace_string as a catch-all. - replace_configs.append({'replace_with': replace_string}) - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - redact_config = { - 'info_types': info_types, - 'min_likelihood': min_likelihood, - } - - # Construct the items list (in this case, only one item, in string form). - items = [{'type': 'text/plain', 'value': item}] - - # Call the API. - response = dlp.redact_content(redact_config, items, replace_configs) - - # Print out the results. - print(response.items[0].value) -# [END redact_string] - - # [START redact_image] def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): @@ -168,67 +107,37 @@ def redact_image(project, filename, output_filename, default_project = os.environ.get('GCLOUD_PROJECT') parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') - subparsers.required = True - - parser_string = subparsers.add_parser('string', help='Redact a string.') - parser_string.add_argument('item', help='The string to inspect.') - parser_string.add_argument( - 'replace_string', - help='The string to use to replace protected data; for instance, ' - '"***" or "REDACTED".') - parser_string.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", ' - '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, ' - 'the API will use a limited default set. Specify this flag ' - 'multiple times to specify multiple info types.') - parser_string.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') - parser_file = subparsers.add_parser('image', help='Redact an image file.') - parser_file.add_argument( + parser.add_argument( 'filename', help='The path to the file to inspect.') - parser_file.add_argument( + parser.add_argument( 'output_filename', help='The path to which the redacted image will be written.') - parser_file.add_argument( + parser.add_argument( '--project', help='The Google Cloud project id to use as a parent resource.', default=default_project) - parser_file.add_argument( + parser.add_argument( '--info_types', action='append', help='Strings representing info types to look for. A full list of ' 'info categories and types is available from the API. 
Examples ' 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) - parser_file.add_argument( + parser.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], help='A string representing the minimum likelihood threshold that ' 'constitutes a match.') - parser_file.add_argument( + parser.add_argument( '--mime_type', help='The MIME type of the file. If not specified, the type is ' 'inferred via the Python standard library\'s mimetypes module.') args = parser.parse_args() - if args.content == 'string': - redact_string( - args.item, args.replace_string, info_types=args.info_types, - min_likelihood=args.min_likelihood) - elif args.content == 'image': - redact_image( - args.project, args.filename, args.output_filename, - args.info_types, min_likelihood=args.min_likelihood, - mime_type=args.mime_type) + redact_image( + args.project, args.filename, args.output_filename, + args.info_types, min_likelihood=args.min_likelihood, + mime_type=args.mime_type) diff --git a/dlp/redact_test.py b/dlp/redact_test.py index 2c95606072b..50eb826b051 100644 --- a/dlp/redact_test.py +++ b/dlp/redact_test.py @@ -31,35 +31,6 @@ def tempdir(): shutil.rmtree(tempdir) -def test_redact_string(capsys): - test_string = 'I am Gary and my email is gary@example.com' - - redact.redact_string(test_string, 'REDACTED') - - out, _ = capsys.readouterr() - assert 'REDACTED' in out - - -def test_redact_string_with_info_types(capsys): - test_string = 'My email is gary@example.com and my number is 206-555-5555' - - redact.redact_string( - test_string, 'REDACTED', info_types=['PHONE_NUMBER']) - - out, _ = capsys.readouterr() - assert 'REDACTED' in out - assert out.count('REDACTED') == 1 - - -def test_redact_string_no_findings(capsys): - test_string = 'Nothing to see here' - - redact.redact_string(test_string, 'REDACTED') - - out, _ = capsys.readouterr() - assert 'REDACTED' not in out - - def test_redact_image_file(tempdir, capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png') output_filepath = os.path.join(tempdir, 'redacted.png') diff --git a/dlp/requirements.txt b/dlp/requirements.txt index b973c95c668..f240b598378 100644 --- a/dlp/requirements.txt +++ b/dlp/requirements.txt @@ -1,3 +1,5 @@ google-cloud-dlp==0.1.1 google-cloud-storage==1.8.0 -google.cloud.pubsub==0.32.1 +google-cloud-pubsub==0.32.1 +google-cloud-datastore==1.6.0 +google-cloud-bigquery==0.31.0 From 64e419fb155be87b54d43a0dda552e04c8c849e0 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:12:30 -0700 Subject: [PATCH 2/4] fix region tags for redact image --- dlp/redact.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlp/redact.py b/dlp/redact.py index 7d7a5379293..5757782fa24 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -22,7 +22,7 @@ import os -# [START redact_image] +# [START dlp_redact_image] def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): """Uses the Data Loss Prevention API to redact protected data in an image. 
@@ -100,7 +100,7 @@ def redact_image(project, filename, output_filename, f.write(response.redacted_image) print("Wrote {byte_count} to {filename}".format( byte_count=len(response.redacted_image), filename=output_filename)) -# [END redact_string] +# [END dlp_redact_image] if __name__ == '__main__': From 1c3965c8aa967a8b2e6269f0fb8daca1296cb6db Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:19:17 -0700 Subject: [PATCH 3/4] Fix redact docstring --- dlp/redact.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlp/redact.py b/dlp/redact.py index 5757782fa24..9e0863374e9 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -27,6 +27,7 @@ def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): """Uses the Data Loss Prevention API to redact protected data in an image. Args: + project: The Google Cloud project id to use as a parent resource. filename: The path to the file to inspect. output_filename: The path to which the redacted image will be written. info_types: A list of strings representing info types to look for. From 52779238e480679f10ad2eb4976dafeed13d0e43 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:23:12 -0700 Subject: [PATCH 4/4] Fix the redact image docstring AGAIN --- dlp/redact.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlp/redact.py b/dlp/redact.py index 9e0863374e9..85fb9ef6458 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -31,8 +31,7 @@ def redact_image(project, filename, output_filename, filename: The path to the file to inspect. output_filename: The path to which the redacted image will be written. info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. If - info_types is omitted, the API will use a limited default set. + A full list of info type categories can be fetched from the API. min_likelihood: A string representing the minimum likelihood threshold that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
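
With this series applied, the new job types can be exercised end to end
roughly as follows, assuming GCLOUD_PROJECT is set and a topic named
dlp-test with a subscription dlp-test-subscription already exists, as in
the tests (the project, kind, dataset and table names are placeholders):

    python inspect_content.py datastore my-datastore-project Person \
        dlp-test dlp-test-subscription --info_types EMAIL_ADDRESS
    python inspect_content.py bigquery my-bq-project dlp_test_dataset \
        dlp_test_table dlp-test dlp-test-subscription
    python redact.py resources/test.png redacted.png --info_types PHONE_NUMBER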